mirror of
https://github.com/searxng/searxng.git
synced 2025-01-08 17:35:25 +00:00
[enh] add raise_for_httperror
check HTTP response: * detect some common CAPTCHA challenges (no solving). In this case the engine is suspended for a long time. * otherwise raise HTTPError as before. The check is now done in poolrequests.py (it was previously in search.py). Update qwant, wikipedia, wikidata to use raise_for_httperror instead of raise_for_status.
This commit is contained in:
parent
033f39bff7
commit
d703119d3a
11 changed files with 179 additions and 56 deletions
|
@ -134,19 +134,19 @@ The function ``def request(query, params):`` always returns the ``params``
|
|||
variable. Inside searx, the following parameters can be used to specify a search
|
||||
request:
|
||||
|
||||
================== =========== ==========================================================================
|
||||
argument type information
|
||||
================== =========== ==========================================================================
|
||||
url string requested url
|
||||
method string HTTP request method
|
||||
headers set HTTP header information
|
||||
data set HTTP data information (parsed if ``method != 'GET'``)
|
||||
cookies set HTTP cookies
|
||||
verify boolean Performing SSL-Validity check
|
||||
max_redirects int maximum redirects, hard limit
|
||||
soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine
|
||||
raise_for_status bool True by default: raise an exception if the HTTP code of response is >= 300
|
||||
================== =========== ==========================================================================
|
||||
=================== =========== ==========================================================================
|
||||
argument type information
|
||||
=================== =========== ==========================================================================
|
||||
url string requested url
|
||||
method string HTTP request method
|
||||
headers set HTTP header information
|
||||
data set HTTP data information (parsed if ``method != 'GET'``)
|
||||
cookies set HTTP cookies
|
||||
verify boolean Performing SSL-Validity check
|
||||
max_redirects int maximum redirects, hard limit
|
||||
soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine
|
||||
raise_for_httperror bool True by default: raise an exception if the HTTP code of response is >= 400
|
||||
=================== =========== ==========================================================================
|
||||
|
||||
|
||||
example code
|
||||
|
|
|
@ -281,8 +281,12 @@ def initialize_engines(engine_list):
|
|||
load_engines(engine_list)
|
||||
|
||||
def engine_init(engine_name, init_fn):
|
||||
init_fn(get_engine_from_settings(engine_name))
|
||||
logger.debug('%s engine: Initialized', engine_name)
|
||||
try:
|
||||
init_fn(get_engine_from_settings(engine_name))
|
||||
except Exception:
|
||||
logger.exception('%s engine: Fail to initialize', engine_name)
|
||||
else:
|
||||
logger.debug('%s engine: Initialized', engine_name)
|
||||
|
||||
for engine_name, engine in engines.items():
|
||||
if hasattr(engine, 'init'):
|
||||
|
|
|
@ -14,6 +14,8 @@ from datetime import datetime
|
|||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
from searx.utils import html_to_text, match_language
|
||||
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
|
||||
from searx.raise_for_httperror import raise_for_httperror
|
||||
|
||||
|
||||
# engine dependent config
|
||||
|
@ -24,8 +26,7 @@ supported_languages_url = 'https://qwant.com/region'
|
|||
|
||||
category_to_keyword = {'general': 'web',
|
||||
'images': 'images',
|
||||
'news': 'news',
|
||||
'social media': 'social'}
|
||||
'news': 'news'}
|
||||
|
||||
# search-url
|
||||
url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
|
||||
|
@ -51,6 +52,7 @@ def request(query, params):
|
|||
params['url'] += '&locale=' + language.replace('-', '_').lower()
|
||||
|
||||
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
|
||||
params['raise_for_httperror'] = False
|
||||
return params
|
||||
|
||||
|
||||
|
@ -58,8 +60,20 @@ def request(query, params):
|
|||
def response(resp):
|
||||
results = []
|
||||
|
||||
# According to https://www.qwant.com/js/app.js
|
||||
if resp.status_code == 429:
|
||||
raise SearxEngineCaptchaException()
|
||||
|
||||
# raise for other errors
|
||||
raise_for_httperror(resp)
|
||||
|
||||
# load JSON result
|
||||
search_results = loads(resp.text)
|
||||
|
||||
# check for an API error
|
||||
if search_results.get('status') != 'success':
|
||||
raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))
|
||||
|
||||
# return empty array if there are no results
|
||||
if 'data' not in search_results:
|
||||
return []
|
||||
|
@ -90,15 +104,6 @@ def response(resp):
|
|||
'thumbnail_src': thumbnail_src,
|
||||
'img_src': img_src})
|
||||
|
||||
elif category_to_keyword.get(categories[0], '') == 'social':
|
||||
published_date = datetime.fromtimestamp(result['date'], None)
|
||||
img_src = result.get('img', None)
|
||||
results.append({'url': res_url,
|
||||
'title': title,
|
||||
'publishedDate': published_date,
|
||||
'content': content,
|
||||
'img_src': img_src})
|
||||
|
||||
elif category_to_keyword.get(categories[0], '') == 'news':
|
||||
published_date = datetime.fromtimestamp(result['date'], None)
|
||||
media = result.get('media', [])
|
||||
|
|
|
@ -161,9 +161,6 @@ def request(query, params):
|
|||
|
||||
def response(resp):
|
||||
results = []
|
||||
if resp.status_code != 200:
|
||||
logger.debug('SPARQL endpoint error %s', resp.content.decode())
|
||||
resp.raise_for_status()
|
||||
jsonresponse = loads(resp.content.decode())
|
||||
|
||||
language = resp.search_params['language'].lower()
|
||||
|
|
|
@ -14,6 +14,7 @@ from urllib.parse import quote
|
|||
from json import loads
|
||||
from lxml.html import fromstring
|
||||
from searx.utils import match_language, searx_useragent
|
||||
from searx.raise_for_httperror import raise_for_httperror
|
||||
|
||||
# search-url
|
||||
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
|
||||
|
@ -37,7 +38,7 @@ def request(query, params):
|
|||
language=url_lang(params['language']))
|
||||
|
||||
params['headers']['User-Agent'] = searx_useragent()
|
||||
params['raise_for_status'] = False
|
||||
params['raise_for_httperror'] = False
|
||||
params['soft_max_redirects'] = 2
|
||||
|
||||
return params
|
||||
|
@ -47,6 +48,7 @@ def request(query, params):
|
|||
def response(resp):
|
||||
if resp.status_code == 404:
|
||||
return []
|
||||
raise_for_httperror(resp)
|
||||
|
||||
results = []
|
||||
api_result = loads(resp.text)
|
||||
|
|
|
@ -64,8 +64,33 @@ class SearxEngineAPIException(SearxEngineResponseException):
|
|||
"""The website has returned an application error"""
|
||||
|
||||
|
||||
class SearxEngineCaptchaException(SearxEngineResponseException):
|
||||
"""The website has returned a CAPTCHA"""
|
||||
class SearxEngineAccessDeniedException(SearxEngineResponseException):
|
||||
"""The website is blocking the access"""
|
||||
|
||||
def __init__(self, suspended_time=24 * 3600, message='Access denied'):
|
||||
super().__init__(message + ', suspended_time=' + str(suspended_time))
|
||||
self.suspended_time = suspended_time
|
||||
self.message = message
|
||||
|
||||
|
||||
class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
|
||||
"""The website has returned a CAPTCHA
|
||||
|
||||
By default, searx stops sending requests to this engine for 1 day.
|
||||
"""
|
||||
|
||||
def __init__(self, suspended_time=24 * 3600, message='CAPTCHA'):
|
||||
super().__init__(message=message, suspended_time=suspended_time)
|
||||
|
||||
|
||||
class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
|
||||
"""The website has returned a Too Many Requests status code
|
||||
|
||||
By default, searx stops sending requests to this engine for 1 hour.
|
||||
"""
|
||||
|
||||
def __init__(self, suspended_time=3600, message='Too many request'):
|
||||
super().__init__(message=message, suspended_time=suspended_time)
|
||||
|
||||
|
||||
class SearxEngineXPathException(SearxEngineResponseException):
|
||||
|
|
|
@ -4,7 +4,8 @@ import logging
|
|||
from json import JSONDecodeError
|
||||
from urllib.parse import urlparse
|
||||
from requests.exceptions import RequestException
|
||||
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
|
||||
from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException,
|
||||
SearxEngineAccessDeniedException)
|
||||
from searx import logger
|
||||
|
||||
|
||||
|
@ -100,6 +101,10 @@ def get_messages(exc, filename) -> typing.Tuple:
|
|||
return (exc.xpath_str, exc.message)
|
||||
if isinstance(exc, SearxEngineXPathException):
|
||||
return (exc.xpath_str, exc.message)
|
||||
if isinstance(exc, SearxEngineAPIException):
|
||||
return (str(exc.args[0]), )
|
||||
if isinstance(exc, SearxEngineAccessDeniedException):
|
||||
return (exc.message, )
|
||||
return ()
|
||||
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ import requests
|
|||
|
||||
from searx import settings
|
||||
from searx import logger
|
||||
from searx.raise_for_httperror import raise_for_httperror
|
||||
|
||||
|
||||
logger = logger.getChild('poolrequests')
|
||||
|
@ -156,6 +157,12 @@ def request(method, url, **kwargs):
|
|||
if timeout is not None:
|
||||
kwargs['timeout'] = timeout
|
||||
|
||||
# raise_for_httperror
|
||||
check_for_httperror = True
|
||||
if 'raise_for_httperror' in kwargs:
|
||||
check_for_httperror = kwargs['raise_for_httperror']
|
||||
del kwargs['raise_for_httperror']
|
||||
|
||||
# do request
|
||||
response = session.request(method=method, url=url, **kwargs)
|
||||
|
||||
|
@ -176,6 +183,10 @@ def request(method, url, **kwargs):
|
|||
if hasattr(threadLocal, 'total_time'):
|
||||
threadLocal.total_time += time_after_request - time_before_request
|
||||
|
||||
# raise an exception
|
||||
if check_for_httperror:
|
||||
raise_for_httperror(response)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
|
|
66
searx/raise_for_httperror.py
Normal file
66
searx/raise_for_httperror.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Raise an exception when an HTTP response is an error.
|
||||
"""
|
||||
from searx.exceptions import (SearxEngineCaptchaException, SearxEngineTooManyRequestsException,
|
||||
SearxEngineAccessDeniedException)
|
||||
|
||||
|
||||
def is_cloudflare_challenge(resp):
|
||||
if resp.status_code in [429, 503]:
|
||||
if ('__cf_chl_jschl_tk__=' in resp.text)\
|
||||
or ('/cdn-cgi/challenge-platform/' in resp.text
|
||||
and 'orchestrate/jsch/v1' in resp.text
|
||||
and 'window._cf_chl_enter(' in resp.text):
|
||||
return True
|
||||
if resp.status_code == 403 and '__cf_chl_captcha_tk__=' in resp.text:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_cloudflare_firewall(resp):
|
||||
return resp.status_code == 403 and '<span class="cf-error-code">1020</span>' in resp.text
|
||||
|
||||
|
||||
def raise_for_cloudflare_captcha(resp):
|
||||
if resp.headers.get('Server', '').startswith('cloudflare'):
|
||||
if is_cloudflare_challenge(resp):
|
||||
# https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
|
||||
# suspend for 15 days
|
||||
raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA', suspended_time=3600 * 24 * 15)
|
||||
|
||||
if is_cloudflare_firewall(resp):
|
||||
raise SearxEngineAccessDeniedException(message='Cloudflare Firewall', suspended_time=3600 * 24)
|
||||
|
||||
|
||||
def raise_for_recaptcha(resp):
|
||||
if resp.status_code == 503 \
|
||||
and '"https://www.google.com/recaptcha/' in resp.text:
|
||||
raise SearxEngineCaptchaException(message='ReCAPTCHA', suspended_time=3600 * 24 * 7)
|
||||
|
||||
|
||||
def raise_for_captcha(resp):
|
||||
raise_for_cloudflare_captcha(resp)
|
||||
raise_for_recaptcha(resp)
|
||||
|
||||
|
||||
def raise_for_httperror(resp):
|
||||
"""Raise an exception when an HTTP response is an error.
|
||||
|
||||
Args:
|
||||
resp (requests.Response): Response to check
|
||||
|
||||
Raises:
|
||||
requests.HTTPError: raised by resp.raise_for_status()
|
||||
searx.exceptions.SearxEngineAccessDeniedException: raised when the HTTP status code is 402 or 403.
|
||||
searx.exceptions.SearxEngineTooManyRequestsException: raised when the HTTP status code is 429.
|
||||
searx.exceptions.SearxEngineCaptchaException: raised when a CAPTCHA challenge is detected.
|
||||
"""
|
||||
if resp.status_code and resp.status_code >= 400:
|
||||
raise_for_captcha(resp)
|
||||
if resp.status_code in (402, 403):
|
||||
raise SearxEngineAccessDeniedException(message='HTTP error ' + str(resp.status_code),
|
||||
suspended_time=3600 * 24)
|
||||
if resp.status_code == 429:
|
||||
raise SearxEngineTooManyRequestsException()
|
||||
resp.raise_for_status()
|
|
@ -32,7 +32,8 @@ from searx.utils import gen_useragent
|
|||
from searx.results import ResultContainer
|
||||
from searx import logger
|
||||
from searx.plugins import plugins
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
|
||||
SearxEngineTooManyRequestsException,)
|
||||
from searx.metrology.error_recorder import record_exception, record_error
|
||||
|
||||
|
||||
|
@ -131,6 +132,9 @@ def send_http_request(engine, request_params):
|
|||
# soft_max_redirects
|
||||
soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0)
|
||||
|
||||
# raise_for_httperror
|
||||
request_args['raise_for_httperror'] = request_params.get('raise_for_httperror', False)
|
||||
|
||||
# specific type of request (GET or POST)
|
||||
if request_params['method'] == 'GET':
|
||||
req = requests_lib.get
|
||||
|
@ -142,10 +146,6 @@ def send_http_request(engine, request_params):
|
|||
# send the request
|
||||
response = req(request_params['url'], **request_args)
|
||||
|
||||
# check HTTP status
|
||||
if request_params.get('raise_for_status'):
|
||||
response.raise_for_status()
|
||||
|
||||
# check soft limit of the redirect count
|
||||
if len(response.history) > soft_max_redirects:
|
||||
# unexpected redirect : record an error
|
||||
|
@ -191,6 +191,7 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
|
|||
|
||||
# suppose everything will be alright
|
||||
requests_exception = False
|
||||
suspended_time = None
|
||||
|
||||
try:
|
||||
# send requests and parse the results
|
||||
|
@ -240,6 +241,15 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
|
|||
elif (issubclass(e.__class__, SearxEngineCaptchaException)):
|
||||
result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
|
||||
logger.exception('engine {0} : CAPTCHA')
|
||||
suspended_time = e.suspended_time # pylint: disable=no-member
|
||||
elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
|
||||
result_container.add_unresponsive_engine(engine_name, 'too many requests')
|
||||
logger.exception('engine {0} : Too many requests')
|
||||
suspended_time = e.suspended_time # pylint: disable=no-member
|
||||
elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
|
||||
result_container.add_unresponsive_engine(engine_name, 'blocked')
|
||||
logger.exception('engine {0} : Searx is blocked')
|
||||
suspended_time = e.suspended_time # pylint: disable=no-member
|
||||
else:
|
||||
result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
|
||||
# other errors
|
||||
|
@ -248,16 +258,18 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
|
|||
if getattr(threading.current_thread(), '_timeout', False):
|
||||
record_error(engine_name, 'Timeout')
|
||||
|
||||
# suspend or not the engine if there are HTTP errors
|
||||
# suspend the engine if there is an HTTP error
|
||||
# or suspended_time is defined
|
||||
with threading.RLock():
|
||||
if requests_exception:
|
||||
if requests_exception or suspended_time:
|
||||
# update continuous_errors / suspend_end_time
|
||||
engine.continuous_errors += 1
|
||||
engine.suspend_end_time = time() + min(settings['search']['max_ban_time_on_fail'],
|
||||
engine.continuous_errors * settings['search']['ban_time_on_fail'])
|
||||
if suspended_time is None:
|
||||
suspended_time = min(settings['search']['max_ban_time_on_fail'],
|
||||
engine.continuous_errors * settings['search']['ban_time_on_fail'])
|
||||
engine.suspend_end_time = time() + suspended_time
|
||||
else:
|
||||
# no HTTP error (perhaps an engine error)
|
||||
# anyway, reset the suspend variables
|
||||
# reset the suspend variables
|
||||
engine.continuous_errors = 0
|
||||
engine.suspend_end_time = 0
|
||||
|
||||
|
@ -342,7 +354,7 @@ def default_request_params():
|
|||
'cookies': {},
|
||||
'verify': True,
|
||||
'auth': None,
|
||||
'raise_for_status': True
|
||||
'raise_for_httperror': True
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -647,11 +647,6 @@ engines:
|
|||
shortcut : qwn
|
||||
categories : news
|
||||
|
||||
- name : qwant social
|
||||
engine : qwant
|
||||
shortcut : qws
|
||||
categories : social media
|
||||
|
||||
# - name: library
|
||||
# engine: recoll
|
||||
# shortcut: lib
|
||||
|
@ -817,12 +812,13 @@ engines:
|
|||
# Or you can use the html non-stable engine, activated by default
|
||||
engine : youtube_noapi
|
||||
|
||||
- name : yggtorrent
|
||||
engine : yggtorrent
|
||||
shortcut : ygg
|
||||
url: https://www2.yggtorrent.si/
|
||||
disabled : True
|
||||
timeout : 4.0
|
||||
# tmp suspended: Cloudflare CAPTCHA
|
||||
#- name : yggtorrent
|
||||
# engine : yggtorrent
|
||||
# shortcut : ygg
|
||||
# url: https://www2.yggtorrent.si/
|
||||
# disabled : True
|
||||
# timeout : 4.0
|
||||
|
||||
- name : dailymotion
|
||||
engine : dailymotion
|
||||
|
|
Loading…
Reference in a new issue