From d703119d3a313a406482b121ee94c6afee3bc307 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Wed, 9 Dec 2020 21:23:20 +0100 Subject: [PATCH] [enh] add raise_for_httperror check HTTP response: * detect some common CAPTCHA challenges (no solving). In this case the engine is suspended for a long time. * otherwise raise HTTPError as before the check is done in poolrequests.py (previously in search.py). update qwant, wikipedia, wikidata to use raise_for_httperror instead of raise_for_status --- docs/dev/engine_overview.rst | 26 ++++++------ searx/engines/__init__.py | 8 +++- searx/engines/qwant.py | 27 +++++++------ searx/engines/wikidata.py | 3 -- searx/engines/wikipedia.py | 4 +- searx/exceptions.py | 29 +++++++++++++- searx/metrology/error_recorder.py | 7 +++- searx/poolrequests.py | 11 ++++++ searx/raise_for_httperror.py | 66 +++++++++++++++++++++++++++++++ searx/search.py | 36 +++++++++++------ searx/settings.yml | 18 ++++----- 11 files changed, 179 insertions(+), 56 deletions(-) create mode 100644 searx/raise_for_httperror.py diff --git a/docs/dev/engine_overview.rst b/docs/dev/engine_overview.rst index 99726a456..3562ca61a 100644 --- a/docs/dev/engine_overview.rst +++ b/docs/dev/engine_overview.rst @@ -134,19 +134,19 @@ The function ``def request(query, params):`` always returns the ``params`` variable. Inside searx, the following paramters can be used to specify a search request: -================== =========== ========================================================================== -argument type information -================== =========== ========================================================================== -url string requested url -method string HTTP request method -headers set HTTP header information -data set HTTP data information (parsed if ``method != 'GET'``) -cookies set HTTP cookies -verify boolean Performing SSL-Validity check -max_redirects int maximum redirects, hard limit -soft_max_redirects int maximum redirects, soft limit. 
Record an error but don't stop the engine -raise_for_status bool True by default: raise an exception if the HTTP code of response is >= 300 -================== =========== ========================================================================== +=================== =========== ========================================================================== +argument type information +=================== =========== ========================================================================== +url string requested url +method string HTTP request method +headers set HTTP header information +data set HTTP data information (parsed if ``method != 'GET'``) +cookies set HTTP cookies +verify boolean Performing SSL-Validity check +max_redirects int maximum redirects, hard limit +soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine +raise_for_httperror bool True by default: raise an exception if the HTTP code of response is >= 400 +=================== =========== ========================================================================== example code diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 0b77f2a95..b2a9b25a4 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -281,8 +281,12 @@ def initialize_engines(engine_list): load_engines(engine_list) def engine_init(engine_name, init_fn): - init_fn(get_engine_from_settings(engine_name)) - logger.debug('%s engine: Initialized', engine_name) + try: + init_fn(get_engine_from_settings(engine_name)) + except Exception: + logger.exception('%s engine: Fail to initialize', engine_name) + else: + logger.debug('%s engine: Initialized', engine_name) for engine_name, engine in engines.items(): if hasattr(engine, 'init'): diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index c909ce11b..b785719d9 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -14,6 +14,8 @@ from datetime import datetime from json import loads from urllib.parse 
import urlencode from searx.utils import html_to_text, match_language +from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException +from searx.raise_for_httperror import raise_for_httperror # engine dependent config @@ -24,8 +26,7 @@ supported_languages_url = 'https://qwant.com/region' category_to_keyword = {'general': 'web', 'images': 'images', - 'news': 'news', - 'social media': 'social'} + 'news': 'news'} # search-url url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4' @@ -51,6 +52,7 @@ def request(query, params): params['url'] += '&locale=' + language.replace('-', '_').lower() params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0' + params['raise_for_httperror'] = False return params @@ -58,8 +60,20 @@ def request(query, params): def response(resp): results = [] + # According to https://www.qwant.com/js/app.js + if resp.status_code == 429: + raise SearxEngineCaptchaException() + + # raise for other errors + raise_for_httperror(resp) + + # load JSON result search_results = loads(resp.text) + # check for an API error + if search_results.get('status') != 'success': + raise SearxEngineAPIException('API error ' + str(search_results.get('error', ''))) + # return empty array if there are no results if 'data' not in search_results: return [] @@ -90,15 +104,6 @@ def response(resp): 'thumbnail_src': thumbnail_src, 'img_src': img_src}) - elif category_to_keyword.get(categories[0], '') == 'social': - published_date = datetime.fromtimestamp(result['date'], None) - img_src = result.get('img', None) - results.append({'url': res_url, - 'title': title, - 'publishedDate': published_date, - 'content': content, - 'img_src': img_src}) - elif category_to_keyword.get(categories[0], '') == 'news': published_date = datetime.fromtimestamp(result['date'], None) media = result.get('media', []) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 
60d0dc9a0..8d787caac 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -161,9 +161,6 @@ def request(query, params): def response(resp): results = [] - if resp.status_code != 200: - logger.debug('SPARQL endpoint error %s', resp.content.decode()) - resp.raise_for_status() jsonresponse = loads(resp.content.decode()) language = resp.search_params['language'].lower() diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 000e1af76..54d75108e 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -14,6 +14,7 @@ from urllib.parse import quote from json import loads from lxml.html import fromstring from searx.utils import match_language, searx_useragent +from searx.raise_for_httperror import raise_for_httperror # search-url search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' @@ -37,7 +38,7 @@ def request(query, params): language=url_lang(params['language'])) params['headers']['User-Agent'] = searx_useragent() - params['raise_for_status'] = False + params['raise_for_httperror'] = False params['soft_max_redirects'] = 2 return params @@ -47,6 +48,7 @@ def request(query, params): def response(resp): if resp.status_code == 404: return [] + raise_for_httperror(resp) results = [] api_result = loads(resp.text) diff --git a/searx/exceptions.py b/searx/exceptions.py index 82c1d76dc..67a282da2 100644 --- a/searx/exceptions.py +++ b/searx/exceptions.py @@ -64,8 +64,33 @@ class SearxEngineAPIException(SearxEngineResponseException): """The website has returned an application error""" -class SearxEngineCaptchaException(SearxEngineResponseException): - """The website has returned a CAPTCHA""" +class SearxEngineAccessDeniedException(SearxEngineResponseException): + """The website is blocking the access""" + + def __init__(self, suspended_time=24 * 3600, message='Access denied'): + super().__init__(message + ', suspended_time=' + str(suspended_time)) + self.suspended_time = suspended_time + 
self.message = message + + +class SearxEngineCaptchaException(SearxEngineAccessDeniedException): + """The website has returned a CAPTCHA + + By default, searx stops sending requests to this engine for 1 day. + """ + + def __init__(self, suspended_time=24 * 3600, message='CAPTCHA'): + super().__init__(message=message, suspended_time=suspended_time) + + +class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException): + """The website has returned a Too Many Request status code + + By default, searx stops sending requests to this engine for 1 hour. + """ + + def __init__(self, suspended_time=3600, message='Too many request'): + super().__init__(message=message, suspended_time=suspended_time) class SearxEngineXPathException(SearxEngineResponseException): diff --git a/searx/metrology/error_recorder.py b/searx/metrology/error_recorder.py index 4b67235e1..fee1ef7d6 100644 --- a/searx/metrology/error_recorder.py +++ b/searx/metrology/error_recorder.py @@ -4,7 +4,8 @@ import logging from json import JSONDecodeError from urllib.parse import urlparse from requests.exceptions import RequestException -from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException +from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException, + SearxEngineAccessDeniedException) from searx import logger @@ -100,6 +101,10 @@ def get_messages(exc, filename) -> typing.Tuple: return (exc.xpath_str, exc.message) if isinstance(exc, SearxEngineXPathException): return (exc.xpath_str, exc.message) + if isinstance(exc, SearxEngineAPIException): + return (str(exc.args[0]), ) + if isinstance(exc, SearxEngineAccessDeniedException): + return (exc.message, ) return () diff --git a/searx/poolrequests.py b/searx/poolrequests.py index 1eedc84b8..25a6baed9 100644 --- a/searx/poolrequests.py +++ b/searx/poolrequests.py @@ -7,6 +7,7 @@ import requests from searx import settings from searx import logger +from searx.raise_for_httperror 
import raise_for_httperror logger = logger.getChild('poolrequests') @@ -156,6 +157,12 @@ def request(method, url, **kwargs): if timeout is not None: kwargs['timeout'] = timeout + # raise_for_httperror: searx option, removed from kwargs before the request is sent + check_for_httperror = True + if 'raise_for_httperror' in kwargs: + check_for_httperror = kwargs['raise_for_httperror'] + del kwargs['raise_for_httperror'] + # do request response = session.request(method=method, url=url, **kwargs) @@ -176,6 +183,10 @@ def request(method, url, **kwargs): if hasattr(threadLocal, 'total_time'): threadLocal.total_time += time_after_request - time_before_request + # raise an exception + if check_for_httperror: + raise_for_httperror(response) + return response diff --git a/searx/raise_for_httperror.py b/searx/raise_for_httperror.py new file mode 100644 index 000000000..bd12df9a9 --- /dev/null +++ b/searx/raise_for_httperror.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +Raise an exception when an HTTP response is an error. +""" +from searx.exceptions import (SearxEngineCaptchaException, SearxEngineTooManyRequestsException, + SearxEngineAccessDeniedException) + + +def is_cloudflare_challenge(resp): + if resp.status_code in [429, 503]: + if ('__cf_chl_jschl_tk__=' in resp.text)\ + or ('/cdn-cgi/challenge-platform/' in resp.text + and 'orchestrate/jsch/v1' in resp.text + and 'window._cf_chl_enter(' in resp.text): + return True + if resp.status_code == 403 and '__cf_chl_captcha_tk__=' in resp.text: + return True + return False + + +def is_cloudflare_firewall(resp): + return resp.status_code == 403 and '1020' in resp.text + + +def raise_for_cloudflare_captcha(resp): + if resp.headers.get('Server', '').startswith('cloudflare'): + if is_cloudflare_challenge(resp): + # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha- + # suspend for 15 days + raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA', suspended_time=3600 * 24 * 15) + + if 
is_cloudflare_firewall(resp): + raise SearxEngineAccessDeniedException(message='Cloudflare Firewall', suspended_time=3600 * 24) + + +def raise_for_recaptcha(resp): + if resp.status_code == 503 \ + and '"https://www.google.com/recaptcha/' in resp.text: + raise SearxEngineCaptchaException(message='ReCAPTCHA', suspended_time=3600 * 24 * 7) + + +def raise_for_captcha(resp): + raise_for_cloudflare_captcha(resp) + raise_for_recaptcha(resp) + + +def raise_for_httperror(resp): + """Raise an exception when the HTTP response is an error. + + Args: + resp (requests.Response): Response to check + + Raises: + requests.HTTPError: raised by resp.raise_for_status() + searx.exceptions.SearxEngineAccessDeniedException: raised when the HTTP status code is 402 or 403. + searx.exceptions.SearxEngineTooManyRequestsException: raised when the HTTP status code is 429. + searx.exceptions.SearxEngineCaptchaException: raised when a CAPTCHA challenge is detected. + """ + if resp.status_code and resp.status_code >= 400: + raise_for_captcha(resp) + if resp.status_code in (402, 403): + raise SearxEngineAccessDeniedException(message='HTTP error ' + str(resp.status_code), + suspended_time=3600 * 24) + if resp.status_code == 429: + raise SearxEngineTooManyRequestsException() + resp.raise_for_status() diff --git a/searx/search.py b/searx/search.py index 8c2ad8d72..220950803 100644 --- a/searx/search.py +++ b/searx/search.py @@ -32,7 +32,8 @@ from searx.utils import gen_useragent from searx.results import ResultContainer from searx import logger from searx.plugins import plugins -from searx.exceptions import SearxEngineCaptchaException +from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException, + SearxEngineTooManyRequestsException,) from searx.metrology.error_recorder import record_exception, record_error @@ -131,6 +132,9 @@ def send_http_request(engine, request_params): # soft_max_redirects soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0) + 
# raise_for_httperror + request_args['raise_for_httperror'] = request_params.get('raise_for_httperror', False) + # specific type of request (GET or POST) if request_params['method'] == 'GET': req = requests_lib.get @@ -142,10 +146,6 @@ def send_http_request(engine, request_params): # send the request response = req(request_params['url'], **request_args) - # check HTTP status - if request_params.get('raise_for_status'): - response.raise_for_status() - # check soft limit of the redirect count if len(response.history) > soft_max_redirects: # unexpected redirect : record an error @@ -191,6 +191,7 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont # suppose everything will be alright requests_exception = False + suspended_time = None try: # send requests and parse the results @@ -240,6 +241,15 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont elif (issubclass(e.__class__, SearxEngineCaptchaException)): result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required') logger.exception('engine {0} : CAPTCHA') + suspended_time = e.suspended_time # pylint: disable=no-member + elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)): + result_container.add_unresponsive_engine(engine_name, 'too many requests') + logger.exception('engine {0} : Too many requests'.format(engine_name)) + suspended_time = e.suspended_time # pylint: disable=no-member + elif (issubclass(e.__class__, SearxEngineAccessDeniedException)): + result_container.add_unresponsive_engine(engine_name, 'blocked') + logger.exception('engine {0} : Searx is blocked'.format(engine_name)) + suspended_time = e.suspended_time # pylint: disable=no-member else: result_container.add_unresponsive_engine(engine_name, 'unexpected crash') # others errors @@ -248,16 +258,18 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont if getattr(threading.current_thread(), '_timeout', False): record_error(engine_name, 'Timeout') - # suspend or not the engine if 
there are HTTP errors + # suspend the engine if there is an HTTP error + # or suspended_time is defined with threading.RLock(): - if requests_exception: + if requests_exception or suspended_time: # update continuous_errors / suspend_end_time engine.continuous_errors += 1 - engine.suspend_end_time = time() + min(settings['search']['max_ban_time_on_fail'], - engine.continuous_errors * settings['search']['ban_time_on_fail']) + if suspended_time is None: + suspended_time = min(settings['search']['max_ban_time_on_fail'], + engine.continuous_errors * settings['search']['ban_time_on_fail']) + engine.suspend_end_time = time() + suspended_time else: - # no HTTP error (perhaps an engine error) - # anyway, reset the suspend variables + # reset the suspend variables engine.continuous_errors = 0 engine.suspend_end_time = 0 @@ -342,7 +354,7 @@ def default_request_params(): 'cookies': {}, 'verify': True, 'auth': None, - 'raise_for_status': True + 'raise_for_httperror': True } diff --git a/searx/settings.yml b/searx/settings.yml index 132bf620b..3ba9b745f 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -647,11 +647,6 @@ engines: shortcut : qwn categories : news - - name : qwant social - engine : qwant - shortcut : qws - categories : social media - # - name: library # engine: recoll # shortcut: lib @@ -817,12 +812,13 @@ engines: # Or you can use the html non-stable engine, activated by default engine : youtube_noapi - - name : yggtorrent - engine : yggtorrent - shortcut : ygg - url: https://www2.yggtorrent.si/ - disabled : True - timeout : 4.0 + # tmp suspended: Cloudflare CAPTCHA + #- name : yggtorrent + # engine : yggtorrent + # shortcut : ygg + # url: https://www2.yggtorrent.si/ + # disabled : True + # timeout : 4.0 - name : dailymotion engine : dailymotion