mirror of
https://github.com/searxng/searxng.git
synced 2025-01-25 17:48:07 +00:00
[mod] limiter -> botdetection: modularization and documentation
In order to be able to meet the outstanding requirements, the implementation is modularized and supplemented with documentation. This patch does not contain functional changes, except that it fixes issue #2455 ---- Activate limiter in the settings.yml and simulate a bot request by:: curl -H 'Accept-Language: de-DE,en-US;q=0.7,en;q=0.3' \ -H 'Accept: text/html' -H 'User-Agent: xyz' \ -H 'Accept-Encoding: gzip' \ 'http://127.0.0.1:8888/search?q=foo' In the LOG: DEBUG searx.botdetection.link_token : missing ping for this request: ..... Since ``BURST_MAX_SUSPICIOUS = 2`` you can repeat the query above two times before you get a "Too Many Requests" response. Closes: https://github.com/searxng/searxng/issues/2455 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
5226044c13
commit
1ec325adcc
15 changed files with 541 additions and 161 deletions
|
@ -235,7 +235,7 @@ Global Settings
|
|||
|
||||
``limiter`` :
|
||||
Rate limit the number of requests on the instance, block some bots. The
|
||||
:ref:`limiter plugin` requires a :ref:`settings redis` database.
|
||||
:ref:`limiter src` requires a :ref:`settings redis` database.
|
||||
|
||||
.. _image_proxy:
|
||||
|
||||
|
|
45
docs/src/searx.botdetection.rst
Normal file
45
docs/src/searx.botdetection.rst
Normal file
|
@ -0,0 +1,45 @@
|
|||
.. _botdetection:
|
||||
|
||||
=============
|
||||
Bot Detection
|
||||
=============
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
.. automodule:: searx.botdetection
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.limiter
|
||||
:members:
|
||||
|
||||
|
||||
Rate limit
|
||||
==========
|
||||
|
||||
.. automodule:: searx.botdetection.ip_limit
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.link_token
|
||||
:members:
|
||||
|
||||
|
||||
Probe HTTP headers
|
||||
==================
|
||||
|
||||
.. automodule:: searx.botdetection.http_accept
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.http_accept_encoding
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.http_accept_language
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.http_connection
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.botdetection.http_user_agent
|
||||
:members:
|
|
@ -1,13 +0,0 @@
|
|||
.. _limiter plugin:
|
||||
|
||||
==============
|
||||
Limiter Plugin
|
||||
==============
|
||||
|
||||
.. sidebar:: info
|
||||
|
||||
The :ref:`limiter plugin` requires a :ref:`Redis <settings redis>` database.
|
||||
|
||||
.. automodule:: searx.plugins.limiter
|
||||
:members:
|
||||
|
26
searx/botdetection/__init__.py
Normal file
26
searx/botdetection/__init__.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _botdetection src:
|
||||
|
||||
Bot detection methods
|
||||
---------------------
|
||||
|
||||
The methods implemented in this python package are used by the :ref:`limiter src`.
|
||||
|
||||
"""
|
||||
|
||||
import flask
|
||||
|
||||
|
||||
def dump_request(request: flask.Request):
    """Return a one-line summary of ``request`` for use in log messages.

    The summary starts with the value of the ``X-Forwarded-For`` header and
    the request path, followed by the form data and a fixed selection of HTTP
    headers.  The individual fields are separated by `` || ``.  A header that
    is not set is rendered as ``None``.
    """
    headers = request.headers
    fields = [
        f"{headers.get('X-Forwarded-For')}: '{request.path}'",
        f"form: {request.form}",
    ]
    fields.extend(
        f"{name}: {headers.get(name)}"
        for name in (
            'Accept',
            'Accept-Language',
            'Accept-Encoding',
            'Content-Type',
            'Content-Length',
            'Connection',
            'User-Agent',
        )
    )
    return " || ".join(fields)
|
24
searx/botdetection/http_accept.py
Normal file
24
searx/botdetection/http_accept.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept``
|
||||
----------------------
|
||||
|
||||
The ``http_accept`` method evaluates a request as the request of a bot if the
|
||||
Accept_ header ..
|
||||
|
||||
- did not contain ``text/html``
|
||||
|
||||
.. _Accept:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
|
||||
|
||||
"""
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import flask
|
||||
|
||||
|
||||
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
    """Rate ``request`` as a bot request if it does not accept ``text/html``.

    :param request: the incoming flask request
    :return: ``None`` if ``text/html`` is among the accepted mimetypes,
        otherwise a tuple of HTTP status code (429) and a reason message.
    """
    if 'text/html' in request.accept_mimetypes:
        return None
    return 429, "bot detected, HTTP header Accept did not contain text/html"
|
26
searx/botdetection/http_accept_encoding.py
Normal file
26
searx/botdetection/http_accept_encoding.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept_encoding``
|
||||
-------------------------------
|
||||
|
||||
The ``http_accept_encoding`` method evaluates a request as the request of a
|
||||
bot if the Accept-Encoding_ header ..
|
||||
|
||||
- did not contain ``gzip`` AND ``deflate`` (if both values are missed)
|
||||
- did not contain ``text/html``
|
||||
|
||||
.. _Accept-Encoding:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
|
||||
|
||||
"""
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import flask
|
||||
|
||||
|
||||
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
    """Rate ``request`` as a bot request if neither ``gzip`` nor ``deflate``
    is offered in the ``Accept-Encoding`` header.

    The header is a comma separated list of content codings; each coding may
    carry a weight (e.g. ``gzip;q=1.0``) and coding names are case-insensitive.
    Both are taken into account, so that a well-formed header of a regular
    client is not mistaken for a bot.

    :param request: the incoming flask request
    :return: ``None`` if the request passes, otherwise a tuple of HTTP status
        code (429) and a reason message.
    """
    header = request.headers.get('Accept-Encoding', '')
    # keep only the coding name: drop parameters like ";q=0.8" and normalize case
    codings = {item.split(';', 1)[0].strip().lower() for item in header.split(',')}
    if codings.isdisjoint(('gzip', 'deflate')):
        return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate"
    return None
|
23
searx/botdetection/http_accept_language.py
Normal file
23
searx/botdetection/http_accept_language.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept_language``
|
||||
-------------------------------
|
||||
|
||||
The ``http_accept_language`` method evaluates a request as the request of a bot
|
||||
if the Accept-Language_ header is unset.
|
||||
|
||||
.. _Accept-Language:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import flask
|
||||
|
||||
|
||||
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
    """Rate ``request`` as a bot request if the ``Accept-Language`` header is
    missing or consists of whitespace only.

    :param request: the incoming flask request
    :return: ``None`` if the header has a value, otherwise a tuple of HTTP
        status code (429) and a reason message.
    """
    accept_language = request.headers.get('Accept-Language', '')
    if accept_language.strip():
        return None
    return 429, "bot detected, missing HTTP header Accept-Language"
|
23
searx/botdetection/http_connection.py
Normal file
23
searx/botdetection/http_connection.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_connection``
|
||||
--------------------------
|
||||
|
||||
The ``http_connection`` method evaluates a request as the request of a bot if
|
||||
the Connection_ header is set to ``close``.
|
||||
|
||||
.. _Connection:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import flask
|
||||
|
||||
|
||||
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
    """Rate ``request`` as a bot request if the ``Connection`` header is set
    to ``close`` (surrounding whitespace is ignored).

    :param request: the incoming flask request
    :return: ``None`` if the request passes, otherwise a tuple of HTTP status
        code (429) and a reason message.
    """
    connection = request.headers.get('Connection', '').strip()
    if connection != 'close':
        return None
    return 429, "bot detected, HTTP header 'Connection=close'"
|
54
searx/botdetection/http_user_agent.py
Normal file
54
searx/botdetection/http_user_agent.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_user_agent``
|
||||
--------------------------
|
||||
|
||||
The ``http_user_agent`` method evaluates a request as the request of a bot if
|
||||
the User-Agent_ header is unset or matches the regular expression
|
||||
:py:obj:`USER_AGENT`.
|
||||
|
||||
.. _User-Agent:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
|
||||
|
||||
"""
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import re
|
||||
import flask
|
||||
|
||||
# NOTE: the pattern is applied with ``re.match`` (see ``filter_request`` in
# this module), i.e. an alternative only matches at the *beginning* of the
# User-Agent string unless it starts with ``.*`` itself.
USER_AGENT = (
    r'('
    + r'unknown'
    + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
    + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
    # the dot in "archive.org_bot" is escaped, it is meant literally
    + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive\.org_bot|msnbot'
    + r'|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
    + r'|ZmEu|BLEXBot|bitlybot'
    # unmaintained Farside instances
    + r'|'
    + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
    # other bots and client to block
    + r'|.*PetalBot.*'
    + r')'
)
"""Regular expression that matches to User-Agent_ from known *bots*"""
|
||||
|
||||
# Cache for the compiled USER_AGENT pattern, filled by regexp_user_agent()
_regexp = None


def regexp_user_agent():
    """Return the compiled :py:obj:`USER_AGENT` pattern.

    The pattern is compiled on the first call and the compiled object is
    reused by all subsequent calls.
    """
    global _regexp  # pylint: disable=global-statement
    if _regexp:
        return _regexp
    _regexp = re.compile(USER_AGENT)
    return _regexp
|
||||
|
||||
|
||||
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
    """Rate ``request`` as a bot request if its ``User-Agent`` header matches
    :py:obj:`USER_AGENT`.  A missing header is treated like the value
    ``unknown``.

    :param request: the incoming flask request
    :return: ``None`` if the request passes, otherwise a tuple of HTTP status
        code (429) and a reason message that contains the User-Agent.
    """
    user_agent = request.headers.get('User-Agent', 'unknown')
    if not regexp_user_agent().match(user_agent):
        return None
    return 429, f"bot detected, HTTP header User-Agent: {user_agent}"
|
90
searx/botdetection/ip_limit.py
Normal file
90
searx/botdetection/ip_limit.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
"""
|
||||
Method ``ip_limit``
|
||||
-------------------
|
||||
|
||||
The ``ip_limit`` method counts request from an IP in *sliding windows*. If
|
||||
there are too many requests in a sliding window, the request is evaluated as a
|
||||
bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_
|
||||
header. To protect privacy, only the hash value of an IP is stored in the redis DB
|
||||
and at least for a maximum of 10 minutes.
|
||||
|
||||
The :py:obj:`link_token` method is used to investigate whether a request is
|
||||
*suspicious*. If the :py:obj:`link_token` method is activated and a request is
|
||||
*suspicious* the request rates are reduced:
|
||||
|
||||
- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
|
||||
- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
|
||||
|
||||
.. _X-Forwarded-For:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
|
||||
|
||||
"""
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import flask
|
||||
|
||||
from searx import redisdb
|
||||
from searx import logger
|
||||
from searx.redislib import incr_sliding_window
|
||||
|
||||
from . import link_token
|
||||
|
||||
logger = logger.getChild('botdetection.ip_limit')
|
||||
|
||||
BURST_WINDOW = 20
"""Time (sec) before sliding window for *burst* requests expires."""

BURST_MAX = 15
"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`"""

BURST_MAX_SUSPICIOUS = 2
"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`"""

LONG_WINDOW = 600
"""Time (sec) before the longer sliding window expires."""

LONG_MAX = 150
"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`"""

LONG_MAX_SUSPICIOUS = 10
"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""

API_WINDOW = 3600
"""Time (sec) before sliding window for API requests (format != html) expires."""

API_WONDOW = API_WINDOW
"""Misspelled alias of :py:obj:`API_WINDOW`, kept for backward compatibility."""

API_MAX = 4
"""Maximum requests from one IP in the :py:obj:`API_WINDOW`"""
|
||||
|
||||
|
||||
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
    """Rate limit the requests of one client IP by counting them in sliding
    windows (:py:obj:`incr_sliding_window`).

    The client is identified by the value of the ``X-Forwarded-For`` header.
    If the header is missing an error is logged and the empty string is used
    as identifier.

    - Requests with a ``format`` argument other than ``html`` are counted in
      the API window and limited by :py:obj:`API_MAX`.
    - All requests are counted in the burst window and, if the burst limit is
      not exceeded, in the long window.  The limits that apply depend on
      whether :py:obj:`link_token.is_suspicious` rates the request as
      suspicious.

    :param request: the incoming flask request
    :return: ``None`` if no limit is exceeded, otherwise a tuple of HTTP
        status code (429) and a reason message.
    """
    redis_client = redisdb.client()

    x_forwarded_for = request.headers.get('X-Forwarded-For', '')
    if not x_forwarded_for:
        logger.error("missing HTTP header X-Forwarded-For")

    if request.args.get('format', 'html') != 'html':
        c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW)
        if c > API_MAX:
            return 429, f"bot detected, too many request from {x_forwarded_for} in API_MAX"

    # select the limits (and their names for the message) that apply
    if link_token.is_suspicious(request):
        burst_max, burst_name = BURST_MAX_SUSPICIOUS, 'BURST_MAX_SUSPICIOUS'
        long_max, long_name = LONG_MAX_SUSPICIOUS, 'LONG_MAX_SUSPICIOUS'
    else:
        burst_max, burst_name = BURST_MAX, 'BURST_MAX'
        long_max, long_name = LONG_MAX, 'LONG_MAX'

    c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)
    if c > burst_max:
        return 429, f"bot detected, too many request from {x_forwarded_for} in {burst_name}"

    # the long window is only incremented when the burst limit is not exceeded
    c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW)
    if c > long_max:
        return 429, f"bot detected, too many request from {x_forwarded_for} in {long_name}"

    return None
|
79
searx/botdetection/limiter.py
Normal file
79
searx/botdetection/limiter.py
Normal file
|
@ -0,0 +1,79 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _limiter src:
|
||||
|
||||
Limiter
|
||||
=======
|
||||
|
||||
.. sidebar:: info
|
||||
|
||||
The limiter requires a :ref:`Redis <settings redis>` database.
|
||||
|
||||
Bot protection / IP rate limitation. The intention of rate limitation is to
|
||||
limit suspicious requests from an IP. The motivation behind this is the fact
|
||||
that SearXNG passes through requests from bots and is thus classified as a bot
|
||||
itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked
|
||||
by the search engine (the origin) in some other way.
|
||||
|
||||
To avoid blocking, the requests from bots to SearXNG must also be blocked, this
|
||||
is the task of the limiter. To perform this task, the limiter uses the methods
|
||||
from the :py:obj:`searx.botdetection`.
|
||||
|
||||
To enable the limiter activate:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
server:
|
||||
...
|
||||
limiter: true # rate limit the number of request on the instance, block some bots
|
||||
|
||||
and set the redis-url connection. Check the value, it depends on your redis DB
|
||||
(see :ref:`settings redis`), by example:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
redis:
|
||||
url: unix:///usr/local/searxng-redis/run/redis.sock?db=0
|
||||
|
||||
"""
|
||||
|
||||
from typing import Optional, Tuple
|
||||
import flask
|
||||
|
||||
from searx.botdetection import (
|
||||
http_accept,
|
||||
http_accept_encoding,
|
||||
http_accept_language,
|
||||
http_connection,
|
||||
http_user_agent,
|
||||
ip_limit,
|
||||
)
|
||||
|
||||
|
||||
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
    """Apply the bot detection methods to ``request``.

    - Requests to ``/healthz`` are never filtered.
    - The ``http_user_agent`` method is applied to every other request.
    - Requests to ``/search`` are additionally checked by the ``http_accept``,
      ``http_accept_encoding``, ``http_accept_language``, ``http_connection``
      and ``ip_limit`` methods, in this order.

    The first method that objects to the request ends the evaluation.

    :param request: the incoming flask request
    :return: ``None`` if the request passes all methods, otherwise the tuple
        of HTTP status code and reason message returned by the objecting
        method.
    """
    if request.path == '/healthz':
        return None

    checks = [http_user_agent]
    if request.path == '/search':
        # http_user_agent is already part of the list, no need to run it twice
        checks += [
            http_accept,
            http_accept_encoding,
            http_accept_language,
            http_connection,
            ip_limit,
        ]

    for check in checks:
        val = check.filter_request(request)
        if val is not None:
            return val

    return None
|
126
searx/botdetection/link_token.py
Normal file
126
searx/botdetection/link_token.py
Normal file
|
@ -0,0 +1,126 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``link_token``
|
||||
---------------------
|
||||
|
||||
The ``link_token`` method evaluates a request as :py:obj:`suspicious
|
||||
<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the
|
||||
client. By adding a random component (the token) in the URL a bot can not send
|
||||
a ping by requesting a static URL.
|
||||
|
||||
.. note::
|
||||
|
||||
This method requires a redis DB and needs a HTTP X-Forwarded-For_ header.
|
||||
|
||||
To make use of this method a flask URL route needs to be added:
|
||||
|
||||
.. code:: python
|
||||
|
||||
@app.route('/client<token>.css', methods=['GET', 'POST'])
|
||||
def client_token(token=None):
|
||||
link_token.ping(request, token)
|
||||
return Response('', mimetype='text/css')
|
||||
|
||||
And in the HTML template from flask a stylesheet link is needed (the value of
|
||||
``link_token`` comes from :py:obj:`get_token`):
|
||||
|
||||
.. code:: html
|
||||
|
||||
<link rel="stylesheet"
|
||||
href="{{ url_for('client_token', token=link_token) }}"
|
||||
type="text/css" />
|
||||
|
||||
.. _X-Forwarded-For:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
|
||||
|
||||
"""
|
||||
|
||||
import string
|
||||
import random
|
||||
import flask
|
||||
|
||||
from searx import logger
|
||||
from searx import redisdb
|
||||
from searx.redislib import secret_hash
|
||||
|
||||
TOKEN_LIVE_TIME = 600
|
||||
"""Lifetime (sec) of limiter's CSS token."""
|
||||
|
||||
PING_KEY = 'SearXNG_limiter.ping'
|
||||
TOKEN_KEY = 'SearXNG_limiter.token'
|
||||
|
||||
logger = logger.getChild('botdetection.link_token')
|
||||
|
||||
|
||||
def is_suspicious(request: flask.Request):
    """Rate ``request`` as *suspicious* when no ping is stored for it.

    The ping is looked up in the redis DB under the key from
    :py:obj:`get_ping_key`.  Without a redis client nothing can be looked up
    and the request is not rated as suspicious.

    :return: ``True`` if the ping is missing, otherwise ``False``
    """
    redis_client = redisdb.client()
    if not redis_client:
        return False

    ping_key = get_ping_key(request)
    if redis_client.get(ping_key):
        logger.debug("found ping for this request: %s", ping_key)
        return False

    logger.warning(
        "missing ping (IP: %s) / request: %s",
        request.headers.get('X-Forwarded-For', ''),
        ping_key,
    )
    return True
|
||||
|
||||
|
||||
def ping(request: flask.Request, token: str):
    """Store a ping for ``request``; called by a request to URL
    ``/client<token>.css``.

    Nothing is stored if there is no redis client or if ``token`` is not the
    currently valid token.  The ping expires after :py:obj:`TOKEN_LIVE_TIME`
    seconds.
    """
    redis_client = redisdb.client()
    if not redis_client or not token_is_valid(token):
        return

    ping_key = get_ping_key(request)
    logger.debug("store ping for: %s", ping_key)
    redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME)
|
||||
|
||||
|
||||
def get_ping_key(request: flask.Request):
    """Build the hashed redis key under which the ping of a request is stored.

    The key is derived from :py:obj:`PING_KEY` and the values of the HTTP
    headers ``X-Forwarded-For``, ``Accept-Language`` and ``User-Agent`` (a
    missing header counts as empty string).  At least X-Forwarded-For_ is
    needed to be able to assign the request to an IP.

    """
    headers = request.headers
    parts = [PING_KEY]
    parts.extend(headers.get(name, '') for name in ('X-Forwarded-For', 'Accept-Language', 'User-Agent'))
    return secret_hash(''.join(parts))
|
||||
|
||||
|
||||
def token_is_valid(token) -> bool:
    """Tell whether ``token`` equals the currently active token (see
    :py:obj:`get_token`).  The result is also written to the debug log."""
    is_current = token == get_token()
    logger.debug("token is valid --> %s", is_current)
    return is_current
|
||||
|
||||
|
||||
def get_token() -> str:
    """Returns current token.  If there is no currently active token a new token
    is generated randomly and stored in the redis DB.

    - :py:obj:`TOKEN_LIVE_TIME`
    - :py:obj:`TOKEN_KEY`

    Without a redis client the fixed value ``12345678`` is returned.

    """
    # local import: keeps this function self-contained; the token is drawn
    # from the OS random source instead of the predictable ``random`` module
    import secrets  # pylint: disable=import-outside-toplevel

    redis_client = redisdb.client()
    if not redis_client:
        # This function is also called when limiter is inactive / no redis DB
        # (see render function in webapp.py)
        return '12345678'

    token = redis_client.get(TOKEN_KEY)
    if token:
        return token.decode('UTF-8')

    alphabet = string.ascii_lowercase + string.digits
    token = ''.join(secrets.choice(alphabet) for _ in range(16))
    redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME)
    return token
|
|
@ -1,165 +1,42 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
# pyright: basic
|
||||
"""Some bot protection / rate limitation
|
||||
"""see :ref:`limiter src`"""
|
||||
|
||||
To monitor rate limits and protect privacy the IP addresses are getting stored
|
||||
with a hash so the limiter plugin knows who to block. A redis database is
|
||||
needed to store the hash values.
|
||||
|
||||
Enable the plugin in ``settings.yml``:
|
||||
|
||||
- ``server.limiter: true``
|
||||
- ``redis.url: ...`` check the value, see :ref:`settings redis`
|
||||
"""
|
||||
|
||||
import re
|
||||
import string
|
||||
import random
|
||||
from flask import request
|
||||
import flask
|
||||
|
||||
from searx import redisdb
|
||||
from searx.plugins import logger
|
||||
from searx.redislib import incr_sliding_window, secret_hash
|
||||
from searx.botdetection import limiter
|
||||
from searx.botdetection import dump_request
|
||||
|
||||
name = "Request limiter"
|
||||
description = "Limit the number of request"
|
||||
default_on = False
|
||||
preference_section = 'service'
|
||||
|
||||
logger = logger.getChild('limiter')
|
||||
|
||||
block_user_agent = re.compile(
|
||||
r'('
|
||||
+ r'unknown'
|
||||
+ r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
|
||||
+ r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
|
||||
+ r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
|
||||
+ r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
|
||||
+ r'|ZmEu|BLEXBot|bitlybot'
|
||||
# unmaintained Farside instances
|
||||
+ r'|'
|
||||
+ re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
|
||||
+ '|.*PetalBot.*'
|
||||
+ r')'
|
||||
)
|
||||
|
||||
PING_KEY = 'SearXNG_limiter.ping'
|
||||
TOKEN_KEY = 'SearXNG_limiter.token'
|
||||
|
||||
|
||||
def ping():
|
||||
redis_client = redisdb.client()
|
||||
user_agent = request.headers.get('User-Agent', 'unknown')
|
||||
x_forwarded_for = request.headers.get('X-Forwarded-For', '')
|
||||
|
||||
ping_key = PING_KEY + user_agent + x_forwarded_for
|
||||
redis_client.set(secret_hash(ping_key), 1, ex=600)
|
||||
|
||||
|
||||
def get_token():
|
||||
redis_client = redisdb.client()
|
||||
if not redis_client:
|
||||
# This function is also called when limiter is inactive / no redis DB
|
||||
# (see render function in webapp.py)
|
||||
return '12345678'
|
||||
token = redis_client.get(TOKEN_KEY)
|
||||
if token:
|
||||
token = token.decode('UTF-8')
|
||||
else:
|
||||
token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
|
||||
redis_client.set(TOKEN_KEY, token, ex=600)
|
||||
return token
|
||||
|
||||
|
||||
def token_is_valid(token):
|
||||
valid = token == get_token()
|
||||
logger.debug("token is valid --> %s", valid)
|
||||
return valid
|
||||
|
||||
|
||||
def is_accepted_request() -> bool:
|
||||
# pylint: disable=too-many-return-statements
|
||||
redis_client = redisdb.client()
|
||||
user_agent = request.headers.get('User-Agent', 'unknown')
|
||||
x_forwarded_for = request.headers.get('X-Forwarded-For', '')
|
||||
|
||||
if request.path == '/healthz':
|
||||
return True
|
||||
|
||||
if block_user_agent.match(user_agent):
|
||||
logger.debug("BLOCK %s: %s --> detected User-Agent: %s" % (x_forwarded_for, request.path, user_agent))
|
||||
return False
|
||||
|
||||
if request.path == '/search':
|
||||
|
||||
c_burst_max = 2
|
||||
c_10min_max = 10
|
||||
|
||||
ping_key = PING_KEY + user_agent + x_forwarded_for
|
||||
if redis_client.get(secret_hash(ping_key)):
|
||||
logger.debug('got a ping')
|
||||
c_burst_max = 15
|
||||
c_10min_max = 150
|
||||
else:
|
||||
logger.debug('missing a ping')
|
||||
|
||||
c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20)
|
||||
c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600)
|
||||
if c_burst > c_burst_max or c_10min > c_10min_max:
|
||||
logger.debug("BLOCK %s: too many request", x_forwarded_for)
|
||||
return False
|
||||
|
||||
if len(request.headers.get('Accept-Language', '').strip()) == '':
|
||||
logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for)
|
||||
return False
|
||||
|
||||
if request.headers.get('Connection') == 'close':
|
||||
logger.debug("BLOCK %s: got Connection=close", x_forwarded_for)
|
||||
return False
|
||||
|
||||
accept_encoding_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
|
||||
if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list:
|
||||
logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for)
|
||||
return False
|
||||
|
||||
if 'text/html' not in request.accept_mimetypes:
|
||||
logger.debug("BLOCK %s: Accept-Encoding misses text/html", x_forwarded_for)
|
||||
return False
|
||||
|
||||
if request.args.get('format', 'html') != 'html':
|
||||
c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600)
|
||||
if c > 4:
|
||||
logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for)
|
||||
return False
|
||||
|
||||
logger.debug(
|
||||
"OK %s: '%s'" % (x_forwarded_for, request.path)
|
||||
+ " || form: %s" % request.form
|
||||
+ " || Accept: %s" % request.headers.get('Accept', '')
|
||||
+ " || Accept-Language: %s" % request.headers.get('Accept-Language', '')
|
||||
+ " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding', '')
|
||||
+ " || Content-Type: %s" % request.headers.get('Content-Type', '')
|
||||
+ " || Content-Length: %s" % request.headers.get('Content-Length', '')
|
||||
+ " || Connection: %s" % request.headers.get('Connection', '')
|
||||
+ " || User-Agent: %s" % user_agent
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def pre_request():
|
||||
if not is_accepted_request():
|
||||
return 'Too Many Requests', 429
|
||||
"""See :ref:`flask.Flask.before_request`"""
|
||||
|
||||
val = limiter.filter_request(flask.request)
|
||||
if val is not None:
|
||||
http_status, msg = val
|
||||
client_ip = flask.request.headers.get('X-Forwarded-For', '<unknown>')
|
||||
logger.error("BLOCK (IP %s): %s" % (client_ip, msg))
|
||||
return 'Too Many Requests', http_status
|
||||
|
||||
logger.debug("OK: %s" % dump_request(flask.request))
|
||||
return None
|
||||
|
||||
|
||||
def init(app, settings):
|
||||
def init(app: flask.Flask, settings) -> bool:
|
||||
if not settings['server']['limiter']:
|
||||
return False
|
||||
|
||||
if not redisdb.client():
|
||||
logger.error("The limiter requires Redis") # pylint: disable=undefined-variable
|
||||
logger.error("The limiter requires Redis")
|
||||
return False
|
||||
|
||||
app.before_request(pre_request)
|
||||
return True
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
<link rel="stylesheet" href="{{ url_for('static', filename='css/searxng.min.css') }}" type="text/css" media="screen" />
|
||||
{% endif %}
|
||||
{% if get_setting('server.limiter') %}
|
||||
<link rel="stylesheet" href="{{ url_for('limiter_css', token=limiter_token) }}" type="text/css" media="screen" />
|
||||
<link rel="stylesheet" href="{{ url_for('client_token', token=link_token) }}" type="text/css" />
|
||||
{% endif %}
|
||||
{% block styles %}{% endblock %}
|
||||
<!--[if gte IE 9]>-->
|
||||
|
|
|
@ -93,7 +93,8 @@ from searx.utils import (
|
|||
)
|
||||
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
|
||||
from searx.query import RawTextQuery
|
||||
from searx.plugins import limiter, Plugin, plugins, initialize as plugin_initialize
|
||||
from searx.plugins import Plugin, plugins, initialize as plugin_initialize
|
||||
from searx.botdetection import link_token
|
||||
from searx.plugins.oa_doi_rewrite import get_doi_resolver
|
||||
from searx.preferences import (
|
||||
Preferences,
|
||||
|
@ -416,7 +417,7 @@ def render(template_name: str, **kwargs):
|
|||
kwargs['endpoint'] = 'results' if 'q' in kwargs else request.endpoint
|
||||
kwargs['cookies'] = request.cookies
|
||||
kwargs['errors'] = request.errors
|
||||
kwargs['limiter_token'] = limiter.get_token()
|
||||
kwargs['link_token'] = link_token.get_token()
|
||||
|
||||
# values from the preferences
|
||||
kwargs['preferences'] = request.preferences
|
||||
|
@ -643,10 +644,9 @@ def health():
|
|||
return Response('OK', mimetype='text/plain')
|
||||
|
||||
|
||||
@app.route('/limiter<token>.css', methods=['GET', 'POST'])
|
||||
def limiter_css(token=None):
|
||||
if limiter.token_is_valid(token):
|
||||
limiter.ping()
|
||||
@app.route('/client<token>.css', methods=['GET', 'POST'])
|
||||
def client_token(token=None):
|
||||
link_token.ping(request, token)
|
||||
return Response('', mimetype='text/css')
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue