mirror of
https://github.com/searxng/searxng.git
synced 2024-11-29 14:11:02 +00:00
fd814aac86
This patch was inspired by the discussion around PR-2882 [2]. The goals of this patch are: 1. Convert plugin searx.plugin.limiter to normal code [1] 2. isolation of botdetection from the limiter [2] 3. searx/{tools => botdetection}/config.py and drop searx.tools 4. in URL /config, 'limiter.enabled' is true only if the limiter is really enabled (Redis is available). This patch moves all the code that belongs to botdetection into namespace searx.botdetection and code that belongs to limiter is placed in namespace searx.limiter. The limiter used to be a plugin; at some point botdetection was added, and it was not a plugin. The modularization of these two components was long overdue. With the clear modularization, the documentation could then also be organized according to the architecture. [1] https://github.com/searxng/searxng/pull/2882 [2] https://github.com/searxng/searxng/pull/2882#issuecomment-1741716891 To test: - check the app works without the limiter, check `/config` - check the app works with the limiter and with the token, check `/config` - make docs.live .. and read - http://0.0.0.0:8000/admin/searx.limiter.html - http://0.0.0.0:8000/src/searx.botdetection.html#botdetection Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
67 lines
1.9 KiB
Python
67 lines
1.9 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
# lint: pylint
|
|
"""
|
|
Method ``http_user_agent``
|
|
--------------------------
|
|
|
|
The ``http_user_agent`` method evaluates a request as the request of a bot if
|
|
the User-Agent_ header is unset or matches the regular expression
|
|
:py:obj:`USER_AGENT`.
|
|
|
|
.. _User-Agent:
|
|
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
|
|
|
|
"""
|
|
# pylint: disable=unused-argument
|
|
|
|
from __future__ import annotations
|
|
import re
|
|
from ipaddress import (
|
|
IPv4Network,
|
|
IPv6Network,
|
|
)
|
|
|
|
import flask
|
|
import werkzeug
|
|
|
|
from . import config
|
|
from ._helpers import too_many_requests
|
|
|
|
|
|
USER_AGENT = (
    '('
    + '|'.join(
        (
            # placeholder value used when a request carries no User-Agent header
            r'unknown',
            # command line tools, HTTP libraries and scraping frameworks
            r'[Cc][Uu][Rr][Ll]',
            r'[wW]get',
            r'Scrapy',
            r'splash',
            r'JavaFX',
            r'FeedFetcher',
            r'python-requests',
            r'Go-http-client',
            r'Java',
            r'Jakarta',
            r'okhttp',
            r'HttpClient',
            r'Jersey',
            r'Python',
            r'libwww-perl',
            r'Ruby',
            r'SynHttpClient',
            r'UniversalFeedParser',
            # crawlers
            r'Googlebot',
            r'GoogleImageProxy',
            r'bingbot',
            r'Baiduspider',
            r'yacybot',
            r'YandexMobileBot',
            r'YandexBot',
            r'Yahoo! Slurp',
            r'MJ12bot',
            r'AhrefsBot',
            r'archive.org_bot',
            r'msnbot',
            r'MJ12bot',  # listed twice in the pattern; kept so the value is unchanged
            r'SeznamBot',
            r'linkdexbot',
            r'Netvibes',
            r'SMTBot',
            r'zgrab',
            r'James BOT',
            r'Sogou',
            r'Abonti',
            r'Pixray',
            r'Spinn3r',
            r'SemrushBot',
            r'Exabot',
            r'ZmEu',
            r'BLEXBot',
            r'bitlybot',
            # unmaintained Farside instances
            re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)'),
            # other bots and client to block
            r'.*PetalBot.*',
        )
    )
    + ')'
)
"""Regular expression (one group of ``|``-separated alternatives) that matches
the User-Agent_ of known *bots*"""
|
|
|
|
# Module-level cache for the compiled USER_AGENT pattern; filled on first use.
_regexp = None


def regexp_user_agent():
    """Return the compiled :py:obj:`USER_AGENT` pattern.

    The pattern is compiled on the first call and stored in the module-level
    ``_regexp``; later calls return that stored object.
    """
    global _regexp  # pylint: disable=global-statement
    if _regexp:
        return _regexp
    _regexp = re.compile(USER_AGENT)
    return _regexp
|
|
|
|
|
|
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Reject the request if its ``User-Agent`` header looks like a bot.

    The header value (``'unknown'`` when the header is absent) is tested with
    ``match()`` against the pattern from :py:func:`regexp_user_agent`, i.e. the
    pattern has to match at the beginning of the value.

    :param network: passed through to ``too_many_requests``.
    :param request: request whose ``User-Agent`` header is inspected.
    :param cfg: not used by this method.
    :return: the result of ``too_many_requests`` when the pattern matches,
        otherwise ``None``.
    """
    user_agent = request.headers.get('User-Agent', 'unknown')

    # No match: this method has no objection to the request.
    if regexp_user_agent().match(user_agent) is None:
        return None

    reason = f"bot detected, HTTP header User-Agent: {user_agent}"
    return too_many_requests(network, reason)
|