# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Method ``http_user_agent``
--------------------------

The ``http_user_agent`` method evaluates a request as the request of a bot if
the User-Agent_ header is unset, empty, or matches the regular expression
:py:obj:`USER_AGENT`.

.. _User-Agent:
   https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent

"""
# pylint: disable=unused-argument

from __future__ import annotations
import re
from ipaddress import (
    IPv4Network,
    IPv6Network,
)

import flask
import werkzeug

from . import config
from ._helpers import too_many_requests


# NOTE: the pattern is applied with ``re.match`` (see ``filter_request``), so
# every alternative is anchored at the *start* of the User-Agent string.  Only
# alternatives that begin with ``.*`` can match further into the string.
USER_AGENT = (
    r'('
    + r'unknown'
    + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
    + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
    + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
    + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
    + r'|ZmEu|BLEXBot|bitlybot|HeadlessChrome'
    # unmaintained Farside instances
    + r'|'
    + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
    # other bots and clients to block
    + r'|.*PetalBot.*'
    + r')'
)
"""Regular expression that matches to User-Agent_ from known *bots*"""

# Lazily compiled form of USER_AGENT; filled in by regexp_user_agent().
_regexp: re.Pattern[str] | None = None


def regexp_user_agent() -> re.Pattern[str]:
    """Return the compiled :py:obj:`USER_AGENT` pattern.

    The pattern is compiled on the first call and stored in the module-level
    ``_regexp`` variable; later calls return that same object.
    """
    global _regexp  # pylint: disable=global-statement
    if _regexp is None:
        _regexp = re.compile(USER_AGENT)
    return _regexp


def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Flag the request as coming from a bot based on its User-Agent header.

    :param network: network of the client; passed on to ``too_many_requests``.
    :param request: incoming request whose ``User-Agent`` header is inspected.
    :param cfg: limiter configuration; not used by this method.
    :return: the value returned by ``too_many_requests`` when the User-Agent
        is missing, empty, or matches :py:obj:`USER_AGENT`; otherwise ``None``.
    """
    # A header that is present but empty carries as little information as a
    # missing one, so both fall back to the 'unknown' marker (which the
    # USER_AGENT pattern matches).  Using only a ``get`` default would let an
    # empty ``User-Agent:`` header slip through unchecked.
    user_agent = request.headers.get('User-Agent') or 'unknown'

    # ``match`` anchors at the beginning of the string (see note on USER_AGENT).
    if regexp_user_agent().match(user_agent):
        return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
    return None