From 3786920df975b11c0feb7d8564eb19b634d32977 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Mon, 16 Nov 2020 12:44:07 +0100 Subject: [PATCH] [enh] Add multiple outgoing proxies credits go to @bauruine see https://github.com/searx/searx/pull/1958 --- docs/admin/settings.rst | 48 ++++++++++++------ searx/engines/__init__.py | 18 ++++--- searx/poolrequests.py | 30 ++++++++++- searx/search.py | 2 +- searx/settings.yml | 16 +++--- searx/webapp.py | 5 +- tests/unit/test_poolrequests.py | 89 +++++++++++++++++++++++++++++++++ 7 files changed, 172 insertions(+), 36 deletions(-) create mode 100644 tests/unit/test_poolrequests.py diff --git a/docs/admin/settings.rst b/docs/admin/settings.rst index 17dee4da8..58bce3541 100644 --- a/docs/admin/settings.rst +++ b/docs/admin/settings.rst @@ -36,18 +36,26 @@ Global Settings image_proxy : False # proxying image results through searx default_locale : "" # default interface locale - # uncomment below section if you want to use a proxy + outgoing: # communication with search engines + request_timeout : 2.0 # default timeout in seconds, can be overridden by engine + # max_request_timeout: 10.0 # the maximum timeout in seconds + useragent_suffix : "" # suffix of searx_useragent, could contain information like an email address to the administrator + pool_connections : 100 # Number of different hosts + pool_maxsize : 10 # Number of simultaneous requests by host - #outgoing_proxies : - # http : http://127.0.0.1:8080 - # https: http://127.0.0.1:8080 + #proxies: + # http: + # - http://proxy1:8080 + # - http://proxy2:8080 + # https: + # - http://proxy1:8080 + # - http://proxy2:8080 + # - socks5://user:password@proxy3:1080 + # - socks5h://user:password@proxy4:1080 - # uncomment below section only if you have more than one network interface - # which can be the source of outgoing search requests - - #source_ips: - # - 1.1.1.1 - # - 1.1.1.2 + #source_ips: + # - 1.1.1.1 + # - 1.1.1.2 locales: en : English @@ -105,15 +113,16 @@ Global
Settings code, like ``fr``, ``en``, ``de``. .. _requests proxies: http://requests.readthedocs.io/en/latest/user/advanced/#proxies -.. _PR SOCKS support: https://github.com/kennethreitz/requests/pull/478 +.. _PySocks: https://pypi.org/project/PySocks/ -``outgoing_proxies`` : - Define a proxy you wish to use, see `requests proxies`_. SOCKS proxies are - not supported / see `PR SOCKS support`. +``proxies`` : + Define one or more proxies you wish to use, see `requests proxies`_. + If there is more than one proxy for one protocol (http, https), + requests to the engines are distributed in a round-robin fashion. ``source_ips`` : If you use multiple network interfaces, define from which IP the requests must - be made. + be made. This parameter is ignored when ``proxies`` is set. ``locales`` : Locales codes and their names. Available translations of searx interface. @@ -139,6 +148,15 @@ Engine settings api_key : 'apikey' disabled : True language : en_US + #proxies: + # http: + # - http://proxy1:8080 + # - http://proxy2:8080 + # https: + # - http://proxy1:8080 + # - http://proxy2:8080 + # - socks5://user:password@proxy3:1080 + # - socks5h://user:password@proxy4:1080 ``name`` : Name that will be used across searx to define this engine.
In settings, on diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 00be89412..a80afb1c6 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -25,7 +25,7 @@ from operator import itemgetter from searx import settings from searx import logger from searx.data import ENGINES_LANGUAGES -from searx.poolrequests import get +from searx.poolrequests import get, get_proxy_cycles from searx.utils import load_module, match_language, get_engine_from_settings @@ -79,16 +79,18 @@ def load_engine(engine_data): logger.exception('Cannot load engine "{}"'.format(engine_module)) return None - for param_name in engine_data: + for param_name, param_value in engine_data.items(): if param_name == 'engine': - continue - if param_name == 'categories': - if engine_data['categories'] == 'none': + pass + elif param_name == 'categories': + if param_value == 'none': engine.categories = [] else: - engine.categories = list(map(str.strip, engine_data['categories'].split(','))) - continue - setattr(engine, param_name, engine_data[param_name]) + engine.categories = list(map(str.strip, param_value.split(','))) + elif param_name == 'proxies': + engine.proxies = get_proxy_cycles(param_value) + else: + setattr(engine, param_name, param_value) for arg_name, arg_value in engine_default_args.items(): if not hasattr(engine, arg_name): diff --git a/searx/poolrequests.py b/searx/poolrequests.py index e03797ce2..1eedc84b8 100644 --- a/searx/poolrequests.py +++ b/searx/poolrequests.py @@ -111,6 +111,32 @@ def get_time_for_thread(): return threadLocal.total_time +def get_proxy_cycles(proxy_settings): + if not proxy_settings: + return None + # Backwards compatibility for single proxy in settings.yml + for protocol, proxy in proxy_settings.items(): + if isinstance(proxy, str): + proxy_settings[protocol] = [proxy] + + for protocol in proxy_settings: + proxy_settings[protocol] = cycle(proxy_settings[protocol]) + return proxy_settings + + +GLOBAL_PROXY_CYCLES = 
get_proxy_cycles(settings['outgoing'].get('proxies')) + + +def get_proxies(proxy_cycles): + if proxy_cycles: + return {protocol: next(proxy_cycle) for protocol, proxy_cycle in proxy_cycles.items()} + return None + + +def get_global_proxies(): + return get_proxies(GLOBAL_PROXY_CYCLES) + + def request(method, url, **kwargs): """same as requests/requests/api.py request(...)""" time_before_request = time() @@ -119,8 +145,8 @@ def request(method, url, **kwargs): session = SessionSinglePool() # proxies - if kwargs.get('proxies') is None: - kwargs['proxies'] = settings['outgoing'].get('proxies') + if not kwargs.get('proxies'): + kwargs['proxies'] = get_global_proxies() # timeout if 'timeout' in kwargs: diff --git a/searx/search.py b/searx/search.py index 1cb2a603b..b8ada3901 100644 --- a/searx/search.py +++ b/searx/search.py @@ -119,7 +119,7 @@ def send_http_request(engine, request_params): # setting engine based proxies if hasattr(engine, 'proxies'): - request_args['proxies'] = engine.proxies + request_args['proxies'] = requests_lib.get_proxies(engine.proxies) # specific type of request (GET or POST) if request_params['method'] == 'GET': diff --git a/searx/settings.yml b/searx/settings.yml index 78ae26b97..8af1a17f1 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -63,13 +63,15 @@ outgoing: # communication with search engines pool_connections : 100 # Number of different hosts pool_maxsize : 10 # Number of simultaneous requests by host # uncomment below section if you want to use a proxy -# see http://docs.python-requests.org/en/latest/user/advanced/#proxies -# SOCKS proxies are also supported: see http://requests.readthedocs.io/en/master/user/advanced/#socks -# proxies : -# http : socks5h://127.0.0.1:9050 -# https: socks5h://127.0.0.1:9050 -# using_tor_proxy : True -# extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy +# see https://2.python-requests.org/en/latest/user/advanced/#proxies +# SOCKS proxies are 
also supported: see https://2.python-requests.org/en/latest/user/advanced/#socks +# proxies: +# http: +# - http://proxy1:8080 +# - http://proxy2:8080 +# https: +# - http://proxy1:8080 +# - http://proxy2:8080 # uncomment below section only if you have more than one network interface # which can be the source of outgoing search requests # source_ips: diff --git a/searx/webapp.py b/searx/webapp.py index 2a93c3765..26416c5aa 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -78,6 +78,7 @@ from searx.plugins import plugins from searx.plugins.oa_doi_rewrite import get_doi_resolver from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES from searx.answerers import answerers +from searx.poolrequests import get_global_proxies # serve pages with HTTP/1.1 @@ -149,8 +150,6 @@ _category_names = (gettext('files'), gettext('onions'), gettext('science')) -outgoing_proxies = settings['outgoing'].get('proxies') or None - _flask_babel_get_translations = flask_babel.get_translations @@ -905,7 +904,7 @@ def image_proxy(): stream=True, timeout=settings['outgoing']['request_timeout'], headers=headers, - proxies=outgoing_proxies) + proxies=get_global_proxies()) if resp.status_code == 304: return '', resp.status_code diff --git a/tests/unit/test_poolrequests.py b/tests/unit/test_poolrequests.py new file mode 100644 index 000000000..b22685fd0 --- /dev/null +++ b/tests/unit/test_poolrequests.py @@ -0,0 +1,89 @@ +from unittest.mock import patch +from requests.models import Response + +from searx.testing import SearxTestCase + +import searx.poolrequests +from searx.poolrequests import get_proxy_cycles, get_proxies + + +CONFIG = {'http': ['http://localhost:9090', 'http://localhost:9092'], + 'https': ['http://localhost:9091', 'http://localhost:9093']} + + +class TestProxy(SearxTestCase): + + def test_noconfig(self): + cycles = get_proxy_cycles(None) + self.assertIsNone(cycles) + + cycles = get_proxy_cycles(False) + self.assertIsNone(cycles) + + def 
test_oldconfig(self): + config = { + 'http': 'http://localhost:9090', + 'https': 'http://localhost:9091', + } + cycles = get_proxy_cycles(config) + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + + def test_one_proxy(self): + config = { + 'http': ['http://localhost:9090'], + 'https': ['http://localhost:9091'], + } + cycles = get_proxy_cycles(config) + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + + def test_multiple_proxies(self): + cycles = get_proxy_cycles(CONFIG) + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['http']), 'http://localhost:9092') + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + self.assertEqual(next(cycles['https']), 'http://localhost:9093') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + + def test_getproxies_none(self): + self.assertIsNone(get_proxies(None)) + + def test_getproxies_config(self): + cycles = get_proxy_cycles(CONFIG) + self.assertEqual(get_proxies(cycles), { + 'http': 'http://localhost:9090', + 'https': 'http://localhost:9091' + }) + self.assertEqual(get_proxies(cycles), { + 'http': 'http://localhost:9092', + 'https': 'http://localhost:9093' + }) + + @patch('searx.poolrequests.get_global_proxies') + def test_request(self, mock_get_global_proxies): + method = 'GET' + url = 'http://localhost' + custom_proxies = { + 'https': 'http://localhost:1080' + } + global_proxies = { + 'http': 'http://localhost:9092', + 'https': 'http://localhost:9093' + } + 
mock_get_global_proxies.return_value = global_proxies + + # check the global proxies usage + with patch.object(searx.poolrequests.SessionSinglePool, 'request', return_value=Response()) as mock_method: + searx.poolrequests.request(method, url) + mock_method.assert_called_once_with(method=method, url=url, proxies=global_proxies) + + # check if the proxies parameter overrides the global proxies + with patch.object(searx.poolrequests.SessionSinglePool, 'request', return_value=Response()) as mock_method: + searx.poolrequests.request(method, url, proxies=custom_proxies) + mock_method.assert_called_once_with(method=method, url=url, proxies=custom_proxies)