Merge pull request #2332 from dalf/metrology-errors

[enh] record exception details per engine
This commit is contained in:
Alexandre Flament 2020-12-03 10:31:44 +01:00 committed by GitHub
commit 89fbb85d45
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
24 changed files with 597 additions and 169 deletions

View file

@ -134,16 +134,18 @@ The function ``def request(query, params):`` always returns the ``params``
variable. Inside searx, the following parameters can be used to specify a search variable. Inside searx, the following parameters can be used to specify a search
request: request:
============ =========== ========================================================= ================== =========== ========================================================================
argument type information argument type information
============ =========== ========================================================= ================== =========== ========================================================================
url string requested url url string requested url
method string HTTP request method method string HTTP request method
headers set HTTP header information headers set HTTP header information
data set HTTP data information (parsed if ``method != 'GET'``) data set HTTP data information (parsed if ``method != 'GET'``)
cookies set HTTP cookies cookies set HTTP cookies
verify boolean Performing SSL-Validity check verify boolean Performing SSL-Validity check
============ =========== ========================================================= max_redirects int maximum redirects, hard limit
soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine
================== =========== ========================================================================
example code example code

View file

@ -1,6 +1,6 @@
from urllib.parse import quote, urljoin from urllib.parse import quote, urljoin
from lxml import html from lxml import html
from searx.utils import extract_text, get_torrent_size from searx.utils import extract_text, get_torrent_size, eval_xpath, eval_xpath_list, eval_xpath_getindex
url = 'https://1337x.to/' url = 'https://1337x.to/'
@ -20,12 +20,12 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
for result in dom.xpath('//table[contains(@class, "table-list")]/tbody//tr'): for result in eval_xpath_list(dom, '//table[contains(@class, "table-list")]/tbody//tr'):
href = urljoin(url, result.xpath('./td[contains(@class, "name")]/a[2]/@href')[0]) href = urljoin(url, eval_xpath_getindex(result, './td[contains(@class, "name")]/a[2]/@href', 0))
title = extract_text(result.xpath('./td[contains(@class, "name")]/a[2]')) title = extract_text(eval_xpath(result, './td[contains(@class, "name")]/a[2]'))
seed = extract_text(result.xpath('.//td[contains(@class, "seeds")]')) seed = extract_text(eval_xpath(result, './/td[contains(@class, "seeds")]'))
leech = extract_text(result.xpath('.//td[contains(@class, "leeches")]')) leech = extract_text(eval_xpath(result, './/td[contains(@class, "leeches")]'))
filesize_info = extract_text(result.xpath('.//td[contains(@class, "size")]/text()')) filesize_info = extract_text(eval_xpath(result, './/td[contains(@class, "size")]/text()'))
filesize, filesize_multiplier = filesize_info.split() filesize, filesize_multiplier = filesize_info.split()
filesize = get_torrent_size(filesize, filesize_multiplier) filesize = get_torrent_size(filesize, filesize_multiplier)

View file

@ -132,8 +132,9 @@ def load_engine(engine_data):
lambda: engine._fetch_supported_languages(get(engine.supported_languages_url))) lambda: engine._fetch_supported_languages(get(engine.supported_languages_url)))
engine.stats = { engine.stats = {
'sent_search_count': 0, # sent search
'search_count': 0, # successful search
'result_count': 0, 'result_count': 0,
'search_count': 0,
'engine_time': 0, 'engine_time': 0,
'engine_time_count': 0, 'engine_time_count': 0,
'score_count': 0, 'score_count': 0,

View file

@ -11,7 +11,7 @@
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from searx.utils import extract_text, get_torrent_size from searx.utils import extract_text, get_torrent_size, eval_xpath_list, eval_xpath_getindex
# engine dependent config # engine dependent config
categories = ['files', 'images', 'videos', 'music'] categories = ['files', 'images', 'videos', 'music']
@ -37,24 +37,21 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
for result in dom.xpath(xpath_results): for result in eval_xpath_list(dom, xpath_results):
# defaults # defaults
filesize = 0 filesize = 0
magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce" magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce"
try: category = extract_text(eval_xpath_getindex(result, xpath_category, 0, default=[]))
category = extract_text(result.xpath(xpath_category)[0]) page_a = eval_xpath_getindex(result, xpath_title, 0)
except:
pass
page_a = result.xpath(xpath_title)[0]
title = extract_text(page_a) title = extract_text(page_a)
href = base_url + page_a.attrib.get('href') href = base_url + page_a.attrib.get('href')
magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5]) magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5])
filesize_info = eval_xpath_getindex(result, xpath_filesize, 0, default=None)
if filesize_info:
try: try:
filesize_info = result.xpath(xpath_filesize)[0]
filesize = filesize_info[:-2] filesize = filesize_info[:-2]
filesize_multiplier = filesize_info[-2:] filesize_multiplier = filesize_info[-2:]
filesize = get_torrent_size(filesize, filesize_multiplier) filesize = get_torrent_size(filesize, filesize_multiplier)

View file

@ -12,7 +12,7 @@
from urllib.parse import urlencode, urlparse, parse_qs from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring from lxml.html import fromstring
from searx.engines.xpath import extract_url, extract_text from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath
# engine config # engine config
categories = ['onions'] categories = ['onions']
@ -50,17 +50,17 @@ def response(resp):
# trim results so there's not way too many at once # trim results so there's not way too many at once
first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1) first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
all_results = dom.xpath(results_xpath) all_results = eval_xpath_list(dom, results_xpath)
trimmed_results = all_results[first_result_index:first_result_index + page_size] trimmed_results = all_results[first_result_index:first_result_index + page_size]
# get results # get results
for result in trimmed_results: for result in trimmed_results:
# remove ahmia url and extract the actual url for the result # remove ahmia url and extract the actual url for the result
raw_url = extract_url(result.xpath(url_xpath), search_url) raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0] cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
title = extract_text(result.xpath(title_xpath)) title = extract_text(eval_xpath(result, title_xpath))
content = extract_text(result.xpath(content_xpath)) content = extract_text(eval_xpath(result, content_xpath))
results.append({'url': cleaned_url, results.append({'url': cleaned_url,
'title': title, 'title': title,
@ -68,11 +68,11 @@ def response(resp):
'is_onion': True}) 'is_onion': True})
# get spelling corrections # get spelling corrections
for correction in dom.xpath(correction_xpath): for correction in eval_xpath_list(dom, correction_xpath):
results.append({'correction': extract_text(correction)}) results.append({'correction': extract_text(correction)})
# get number of results # get number of results
number_of_results = dom.xpath(number_of_results_xpath) number_of_results = eval_xpath(dom, number_of_results_xpath)
if number_of_results: if number_of_results:
try: try:
results.append({'number_of_results': int(extract_text(number_of_results))}) results.append({'number_of_results': int(extract_text(number_of_results))})

View file

@ -11,7 +11,7 @@
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from searx.utils import extract_text from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
# engine dependent config # engine dependent config
@ -42,12 +42,13 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results # parse results
for result in dom.xpath('.//div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'): for result in eval_xpath_list(dom, './/div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'):
link = result.xpath('.//h5/a')[0] link = eval_xpath_getindex(result, './/h5/a', 0)
url = base_url + link.attrib.get('href') + '#downloads' url = base_url + link.attrib.get('href') + '#downloads'
title = extract_text(link) title = extract_text(link)
thumbnail_src = base_url + result.xpath('.//img')[0].attrib.get('src').replace('&w=32&h=32', '&w=64&h=64') thumbnail_src = base_url\
+ eval_xpath_getindex(result, './/img', 0).attrib.get('src').replace('&w=32&h=32', '&w=64&h=64')
res = { res = {
'url': url, 'url': url,

View file

@ -13,7 +13,7 @@
from urllib.parse import urlencode, urljoin from urllib.parse import urlencode, urljoin
from lxml import html from lxml import html
from searx.utils import extract_text from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
# engine dependent config # engine dependent config
categories = ['it'] categories = ['it']
@ -131,8 +131,8 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results # parse results
for result in dom.xpath(xpath_results): for result in eval_xpath_list(dom, xpath_results):
link = result.xpath(xpath_link)[0] link = eval_xpath_getindex(result, xpath_link, 0)
href = urljoin(base_url, link.attrib.get('href')) href = urljoin(base_url, link.attrib.get('href'))
title = extract_text(link) title = extract_text(link)

View file

@ -13,6 +13,7 @@
from lxml import html from lxml import html
from datetime import datetime from datetime import datetime
from searx.utils import eval_xpath_list, eval_xpath_getindex
categories = ['science'] categories = ['science']
@ -42,29 +43,26 @@ def response(resp):
results = [] results = []
dom = html.fromstring(resp.content) dom = html.fromstring(resp.content)
search_results = dom.xpath('//entry')
for entry in search_results: for entry in eval_xpath_list(dom, '//entry'):
title = entry.xpath('.//title')[0].text title = eval_xpath_getindex(entry, './/title', 0).text
url = entry.xpath('.//id')[0].text url = eval_xpath_getindex(entry, './/id', 0).text
content_string = '{doi_content}{abstract_content}' content_string = '{doi_content}{abstract_content}'
abstract = entry.xpath('.//summary')[0].text abstract = eval_xpath_getindex(entry, './/summary', 0).text
# If a doi is available, add it to the snippet # If a doi is available, add it to the snippet
try: doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None)
doi_content = entry.xpath('.//link[@title="doi"]')[0].text doi_content = doi_element.text if doi_element is not None else ''
content = content_string.format(doi_content=doi_content, abstract_content=abstract) content = content_string.format(doi_content=doi_content, abstract_content=abstract)
except:
content = content_string.format(doi_content="", abstract_content=abstract)
if len(content) > 300: if len(content) > 300:
content = content[0:300] + "..." content = content[0:300] + "..."
# TODO: center snippet on query term # TODO: center snippet on query term
publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ') publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ')
res_dict = {'url': url, res_dict = {'url': url,
'title': title, 'title': title,

View file

@ -15,7 +15,8 @@ from datetime import datetime
from dateutil import parser from dateutil import parser
from urllib.parse import urlencode, urlparse, parse_qsl from urllib.parse import urlencode, urlparse, parse_qsl
from lxml import etree from lxml import etree
from searx.utils import list_get, match_language from lxml.etree import XPath
from searx.utils import match_language, eval_xpath_getindex
from searx.engines.bing import language_aliases from searx.engines.bing import language_aliases
from searx.engines.bing import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import from searx.engines.bing import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
@ -94,12 +95,12 @@ def response(resp):
# parse results # parse results
for item in rss.xpath('./channel/item'): for item in rss.xpath('./channel/item'):
# url / title / content # url / title / content
url = url_cleanup(item.xpath('./link/text()')[0]) url = url_cleanup(eval_xpath_getindex(item, './link/text()', 0, default=None))
title = list_get(item.xpath('./title/text()'), 0, url) title = eval_xpath_getindex(item, './title/text()', 0, default=url)
content = list_get(item.xpath('./description/text()'), 0, '') content = eval_xpath_getindex(item, './description/text()', 0, default='')
# publishedDate # publishedDate
publishedDate = list_get(item.xpath('./pubDate/text()'), 0) publishedDate = eval_xpath_getindex(item, './pubDate/text()', 0, default=None)
try: try:
publishedDate = parser.parse(publishedDate, dayfirst=False) publishedDate = parser.parse(publishedDate, dayfirst=False)
except TypeError: except TypeError:
@ -108,7 +109,7 @@ def response(resp):
publishedDate = datetime.now() publishedDate = datetime.now()
# thumbnail # thumbnail
thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0) thumbnail = eval_xpath_getindex(item, XPath('./News:Image/text()', namespaces=ns), 0, default=None)
if thumbnail is not None: if thumbnail is not None:
thumbnail = image_url_cleanup(thumbnail) thumbnail = image_url_cleanup(thumbnail)

View file

@ -15,6 +15,7 @@
from json import loads from json import loads
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
from searx.engines.duckduckgo import get_region_code from searx.engines.duckduckgo import get_region_code
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
from searx.poolrequests import get from searx.poolrequests import get
@ -37,7 +38,7 @@ def get_vqd(query, headers):
res = get(query_url, headers=headers) res = get(query_url, headers=headers)
content = res.text content = res.text
if content.find('vqd=\'') == -1: if content.find('vqd=\'') == -1:
raise Exception('Request failed') raise SearxEngineAPIException('Request failed')
vqd = content[content.find('vqd=\'') + 5:] vqd = content[content.find('vqd=\'') + 5:]
vqd = vqd[:vqd.find('\'')] vqd = vqd[:vqd.find('\'')]
return vqd return vqd
@ -71,10 +72,7 @@ def response(resp):
results = [] results = []
content = resp.text content = resp.text
try:
res_json = loads(content) res_json = loads(content)
except:
raise Exception('Cannot parse results')
# parse results # parse results
for result in res_json['results']: for result in res_json['results']:

View file

@ -1,5 +1,6 @@
from json import loads, dumps from json import loads, dumps
from requests.auth import HTTPBasicAuth from requests.auth import HTTPBasicAuth
from searx.exceptions import SearxEngineAPIException
base_url = 'http://localhost:9200' base_url = 'http://localhost:9200'
@ -107,7 +108,7 @@ def response(resp):
resp_json = loads(resp.text) resp_json = loads(resp.text)
if 'error' in resp_json: if 'error' in resp_json:
raise Exception(resp_json['error']) raise SearxEngineAPIException(resp_json['error'])
for result in resp_json['hits']['hits']: for result in resp_json['hits']['hits']:
r = {key: str(value) if not key.startswith('_') else value for key, value in result['_source'].items()} r = {key: str(value) if not key.startswith('_') else value for key, value in result['_source'].items()}

View file

@ -20,9 +20,10 @@ Definitions`_.
from urllib.parse import urlencode, urlparse from urllib.parse import urlencode, urlparse
from lxml import html from lxml import html
from flask_babel import gettext
from searx import logger from searx import logger
from searx.utils import match_language, extract_text, eval_xpath from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.exceptions import SearxEngineCaptchaException
logger = logger.getChild('google engine') logger = logger.getChild('google engine')
@ -131,14 +132,6 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a'
spelling_suggestion_xpath = '//div[@class="med"]/p/a' spelling_suggestion_xpath = '//div[@class="med"]/p/a'
def extract_text_from_dom(result, xpath):
"""returns extract_text on the first result selected by the xpath or None"""
r = eval_xpath(result, xpath)
if len(r) > 0:
return extract_text(r[0])
return None
def get_lang_country(params, lang_list, custom_aliases): def get_lang_country(params, lang_list, custom_aliases):
"""Returns a tuple with *language* on its first and *country* on its second """Returns a tuple with *language* on its first and *country* on its second
position.""" position."""
@ -210,10 +203,10 @@ def response(resp):
# detect google sorry # detect google sorry
resp_url = urlparse(resp.url) resp_url = urlparse(resp.url)
if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect': if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
raise RuntimeWarning('sorry.google.com') raise SearxEngineCaptchaException()
if resp_url.path.startswith('/sorry'): if resp_url.path.startswith('/sorry'):
raise RuntimeWarning(gettext('CAPTCHA required')) raise SearxEngineCaptchaException()
# which subdomain ? # which subdomain ?
# subdomain = resp.search_params.get('google_subdomain') # subdomain = resp.search_params.get('google_subdomain')
@ -230,17 +223,16 @@ def response(resp):
# results --> number_of_results # results --> number_of_results
try: try:
_txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0] _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
_digit = ''.join([n for n in _txt if n.isdigit()]) _digit = ''.join([n for n in _txt if n.isdigit()])
number_of_results = int(_digit) number_of_results = int(_digit)
results.append({'number_of_results': number_of_results}) results.append({'number_of_results': number_of_results})
except Exception as e: # pylint: disable=broad-except except Exception as e: # pylint: disable=broad-except
logger.debug("did not 'number_of_results'") logger.debug("did not 'number_of_results'")
logger.error(e, exc_info=True) logger.error(e, exc_info=True)
# parse results # parse results
for result in eval_xpath(dom, results_xpath): for result in eval_xpath_list(dom, results_xpath):
# google *sections* # google *sections*
if extract_text(eval_xpath(result, g_section_with_header)): if extract_text(eval_xpath(result, g_section_with_header)):
@ -248,14 +240,14 @@ def response(resp):
continue continue
try: try:
title_tag = eval_xpath(result, title_xpath) title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
if not title_tag: if title_tag is None:
# this not one of the common google results *section* # this not one of the common google results *section*
logger.debug('ingoring <div class="g" ../> section: missing title') logger.debug('ingoring <div class="g" ../> section: missing title')
continue continue
title = extract_text(title_tag[0]) title = extract_text(title_tag)
url = eval_xpath(result, href_xpath)[0] url = eval_xpath_getindex(result, href_xpath, 0)
content = extract_text_from_dom(result, content_xpath) content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
results.append({ results.append({
'url': url, 'url': url,
'title': title, 'title': title,
@ -270,11 +262,11 @@ def response(resp):
continue continue
# parse suggestion # parse suggestion
for suggestion in eval_xpath(dom, suggestion_xpath): for suggestion in eval_xpath_list(dom, suggestion_xpath):
# append suggestion # append suggestion
results.append({'suggestion': extract_text(suggestion)}) results.append({'suggestion': extract_text(suggestion)})
for correction in eval_xpath(dom, spelling_suggestion_xpath): for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
results.append({'correction': extract_text(correction)}) results.append({'correction': extract_text(correction)})
# return results # return results
@ -286,7 +278,7 @@ def _fetch_supported_languages(resp):
ret_val = {} ret_val = {}
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
radio_buttons = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lr"]') radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]')
for x in radio_buttons: for x in radio_buttons:
name = x.get("data-name") name = x.get("data-name")

View file

@ -26,8 +26,8 @@ Definitions`_.
from urllib.parse import urlencode, urlparse, unquote from urllib.parse import urlencode, urlparse, unquote
from lxml import html from lxml import html
from flask_babel import gettext
from searx import logger from searx import logger
from searx.exceptions import SearxEngineCaptchaException
from searx.utils import extract_text, eval_xpath from searx.utils import extract_text, eval_xpath
from searx.engines.google import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import from searx.engines.google import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
@ -128,10 +128,10 @@ def response(resp):
# detect google sorry # detect google sorry
resp_url = urlparse(resp.url) resp_url = urlparse(resp.url)
if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect': if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
raise RuntimeWarning('sorry.google.com') raise SearxEngineCaptchaException()
if resp_url.path.startswith('/sorry'): if resp_url.path.startswith('/sorry'):
raise RuntimeWarning(gettext('CAPTCHA required')) raise SearxEngineCaptchaException()
# which subdomain ? # which subdomain ?
# subdomain = resp.search_params.get('google_subdomain') # subdomain = resp.search_params.get('google_subdomain')

View file

@ -13,7 +13,7 @@
from datetime import date, timedelta from datetime import date, timedelta
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from searx.utils import extract_text from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
import re import re
# engine dependent config # engine dependent config
@ -66,11 +66,11 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results # parse results
for result in dom.xpath('//div[@class="g"]'): for result in eval_xpath_list(dom, '//div[@class="g"]'):
title = extract_text(result.xpath('.//h3')) title = extract_text(eval_xpath(result, './/h3'))
url = result.xpath('.//div[@class="r"]/a/@href')[0] url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
content = extract_text(result.xpath('.//span[@class="st"]')) content = extract_text(eval_xpath(result, './/span[@class="st"]'))
# get thumbnails # get thumbnails
script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text) script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)

View file

@ -1,6 +1,6 @@
from lxml import html from lxml import html
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.utils import extract_text, extract_url, eval_xpath from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list
search_url = None search_url = None
url_xpath = None url_xpath = None
@ -42,21 +42,22 @@ def response(resp):
is_onion = True if 'onions' in categories else False is_onion = True if 'onions' in categories else False
if results_xpath: if results_xpath:
for result in eval_xpath(dom, results_xpath): for result in eval_xpath_list(dom, results_xpath):
url = extract_url(eval_xpath(result, url_xpath), search_url) url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
title = extract_text(eval_xpath(result, title_xpath)) title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
content = extract_text(eval_xpath(result, content_xpath)) content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
tmp_result = {'url': url, 'title': title, 'content': content} tmp_result = {'url': url, 'title': title, 'content': content}
# add thumbnail if available # add thumbnail if available
if thumbnail_xpath: if thumbnail_xpath:
thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath) thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath)
if len(thumbnail_xpath_result) > 0: if len(thumbnail_xpath_result) > 0:
tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url) tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
# add alternative cached url if available # add alternative cached url if available
if cached_xpath: if cached_xpath:
tmp_result['cached_url'] = cached_url + extract_text(result.xpath(cached_xpath)) tmp_result['cached_url'] = cached_url\
+ extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
if is_onion: if is_onion:
tmp_result['is_onion'] = True tmp_result['is_onion'] = True
@ -66,19 +67,19 @@ def response(resp):
if cached_xpath: if cached_xpath:
for url, title, content, cached in zip( for url, title, content, cached in zip(
(extract_url(x, search_url) for (extract_url(x, search_url) for
x in dom.xpath(url_xpath)), x in eval_xpath_list(dom, url_xpath)),
map(extract_text, dom.xpath(title_xpath)), map(extract_text, eval_xpath_list(dom, title_xpath)),
map(extract_text, dom.xpath(content_xpath)), map(extract_text, eval_xpath_list(dom, content_xpath)),
map(extract_text, dom.xpath(cached_xpath)) map(extract_text, eval_xpath_list(dom, cached_xpath))
): ):
results.append({'url': url, 'title': title, 'content': content, results.append({'url': url, 'title': title, 'content': content,
'cached_url': cached_url + cached, 'is_onion': is_onion}) 'cached_url': cached_url + cached, 'is_onion': is_onion})
else: else:
for url, title, content in zip( for url, title, content in zip(
(extract_url(x, search_url) for (extract_url(x, search_url) for
x in dom.xpath(url_xpath)), x in eval_xpath_list(dom, url_xpath)),
map(extract_text, dom.xpath(title_xpath)), map(extract_text, eval_xpath_list(dom, title_xpath)),
map(extract_text, dom.xpath(content_xpath)) map(extract_text, eval_xpath_list(dom, content_xpath))
): ):
results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion}) results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})

View file

@ -11,6 +11,7 @@
from json import loads from json import loads
from dateutil import parser from dateutil import parser
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
# engine dependent config # engine dependent config
categories = ['videos', 'music'] categories = ['videos', 'music']
@ -48,7 +49,7 @@ def response(resp):
search_results = loads(resp.text) search_results = loads(resp.text)
if 'error' in search_results and 'message' in search_results['error']: if 'error' in search_results and 'message' in search_results['error']:
raise Exception(search_results['error']['message']) raise SearxEngineAPIException(search_results['error']['message'])
# return empty array if there are no results # return empty array if there are no results
if 'items' not in search_results: if 'items' not in search_results:

View file

@ -34,8 +34,45 @@ class SearxParameterException(SearxException):
class SearxSettingsException(SearxException): class SearxSettingsException(SearxException):
"""Error while loading the settings"""
def __init__(self, message, filename): def __init__(self, message, filename):
super().__init__(message) super().__init__(message)
self.message = message self.message = message
self.filename = filename self.filename = filename
class SearxEngineException(SearxException):
"""Error inside an engine"""
class SearxXPathSyntaxException(SearxEngineException):
"""Syntax error in a XPATH"""
def __init__(self, xpath_spec, message):
super().__init__(str(xpath_spec) + " " + message)
self.message = message
# str(xpath_spec) to deal with str and XPath instance
self.xpath_str = str(xpath_spec)
class SearxEngineResponseException(SearxEngineException):
"""Impossible to parse the result of an engine"""
class SearxEngineAPIException(SearxEngineResponseException):
"""The website has returned an application error"""
class SearxEngineCaptchaException(SearxEngineResponseException):
"""The website has returned a CAPTCHA"""
class SearxEngineXPathException(SearxEngineResponseException):
"""Error while getting the result of an XPath expression"""
def __init__(self, xpath_spec, message):
super().__init__(str(xpath_spec) + " " + message)
self.message = message
# str(xpath_spec) to deal with str and XPath instance
self.xpath_str = str(xpath_spec)

View file

View file

@ -0,0 +1,142 @@
import typing
import inspect
import logging
from json import JSONDecodeError
from urllib.parse import urlparse
from requests.exceptions import RequestException
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import logger
# NOTE(review): import-time side effect -- calls logging.basicConfig() with
# level INFO as soon as this module is imported.
logging.basicConfig(level=logging.INFO)
# Registry of recorded errors: engine name -> {ErrorContext: occurrence count}.
errors_per_engines = {}
class ErrorContext:
    """Describe where and how an error happened.

    Instances compare equal when all their fields are equal and are hashable
    on the same fields, so they can be used as dictionary keys.
    """

    __slots__ = 'filename', 'function', 'line_no', 'code', 'exception_classname', 'log_message', 'log_parameters'

    def __init__(self, filename, function, line_no, code, exception_classname, log_message, log_parameters):
        # location of the error
        self.filename = filename
        self.function = function
        self.line_no = line_no
        # source line at that location
        self.code = code
        # what was raised / logged
        self.exception_classname = exception_classname
        self.log_message = log_message
        self.log_parameters = log_parameters

    def _as_tuple(self):
        """Return every field, in constructor order."""
        return (self.filename, self.function, self.line_no, self.code, self.exception_classname, self.log_message,
                self.log_parameters)

    def __eq__(self, o) -> bool:
        if not isinstance(o, ErrorContext):
            return False
        return self._as_tuple() == o._as_tuple()

    def __hash__(self):
        return hash(self._as_tuple())

    def __repr__(self):
        # Mirror the constructor signature: every field, including `function`,
        # in the same order as __init__.
        return "ErrorContext({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".format(*self._as_tuple())
def add_error_context(engine_name: str, error_context: ErrorContext) -> None:
    """Count one more occurrence of ``error_context`` for ``engine_name``.

    The counter lives in the module-level ``errors_per_engines`` mapping; the
    event is also reported on the debug log.
    """
    if engine_name not in errors_per_engines:
        errors_per_engines[engine_name] = {}
    counters = errors_per_engines[engine_name]
    seen_so_far = counters.get(error_context, 0)
    counters[error_context] = seen_so_far + 1
    logger.debug('⚠️ %s: %s', engine_name, str(error_context))
def get_trace(traces):
    """Pick the frame record that best locates an error.

    The records are scanned from the last one back to the first. When a record
    located in ``searx/search.py`` is reached, the record visited just before
    it is returned, unless that one belongs to ``searx/poolrequests.py`` or
    ``requests/models.py``, in which case the ``searx/search.py`` record itself
    is returned. Without any ``searx/search.py`` record the last record is
    returned.
    """
    wrapper_suffixes = ('searx/poolrequests.py', 'requests/models.py')
    visited_before = traces[-1]
    for record in reversed(traces):
        if not record.filename.endswith('searx/search.py'):
            visited_before = record
            continue
        if visited_before.filename.endswith(wrapper_suffixes):
            return record
        return visited_before
    return traces[-1]
def get_hostname(exc: RequestException) -> typing.Optional[str]:
    """Return the network location (``host[:port]``) of the URL attached to ``exc``.

    The URL of ``exc.request`` is used when there is one, otherwise the URL of
    ``exc.response``. Both attributes may be None: they are checked the same way
    as in get_request_exception_messages.

    Args:
        * exc (RequestException): exception with ``request`` and ``response`` attributes.

    Returns:
        * str: ``netloc`` of the URL, or None when neither the request nor the
          response provides a URL.
    """
    url = None
    if exc.request is not None:
        url = exc.request.url
    if url is None and exc.response is not None:
        url = exc.response.url
    if url is None:
        # urlparse(None) would return bytes (b'') instead of a hostname
        return None
    return urlparse(url).netloc
def get_request_exception_messages(exc: RequestException)\
        -> typing.Tuple[typing.Optional[str], typing.Optional[str], typing.Optional[str]]:
    """Extract the HTTP status, the reason and the hostname from a request exception.

    Args:
        * exc (RequestException): exception with ``request`` and ``response`` attributes
          (each one may be None).

    Returns:
        * tuple ``(status_code, reason, hostname)``:
          ``status_code`` (as a str) and ``reason`` come from ``exc.response`` and are
          None without response; ``hostname`` is the netloc of the request URL, or of
          the response URL when the request has none, or None when there is no URL.
    """
    request = exc.request
    response = exc.response

    # the request URL has priority over the response URL
    url = request.url if request is not None else None
    if url is None and response is not None:
        url = response.url

    hostname = None if url is None else str(urlparse(url).netloc)

    if response is None:
        return (None, None, hostname)
    return (str(response.status_code), response.reason, hostname)
def get_messages(exc, filename) -> typing.Tuple:
    """Return the values worth recording for ``exc``, depending on its type.

    The checks are ordered: JSONDecodeError is tested before ValueError (its base class).

    Args:
        * exc (Exception): the exception to describe.
        * filename (str): file name used to detect a ValueError raised from lxml
          (``'lxml'`` is searched in it).

    Returns:
        * tuple:
          ``(msg,)`` for a JSONDecodeError;
          ``(str(exc),)`` for a TypeError, or for a ValueError when ``filename`` contains ``'lxml'``;
          the result of get_request_exception_messages for a RequestException;
          ``(xpath_str, message)`` for SearxXPathSyntaxException and SearxEngineXPathException;
          an empty tuple otherwise.
    """
    if isinstance(exc, JSONDecodeError):
        return (exc.msg, )

    text_is_enough = isinstance(exc, TypeError) or (isinstance(exc, ValueError) and 'lxml' in filename)
    if text_is_enough:
        return (str(exc), )

    if isinstance(exc, RequestException):
        return get_request_exception_messages(exc)

    if isinstance(exc, (SearxXPathSyntaxException, SearxEngineXPathException)):
        return (exc.xpath_str, exc.message)

    return ()
def get_exception_classname(exc: Exception) -> str:
    """Return the qualified class name of ``exc``.

    Args:
        * exc (Exception): any exception instance.

    Returns:
        * str: ``module.QualName``, or only ``QualName`` when the class has no
          module or is defined in the builtins module.
    """
    klass = exc.__class__
    qualname = klass.__qualname__
    module = klass.__module__
    # str.__class__ is ``type``: its module is the builtins module
    builtins_module = str.__class__.__module__
    if module is not None and module != builtins_module:
        return module + '.' + qualname
    return qualname
def get_error_context(framerecords, exception_classname, log_message, log_parameters) -> ErrorContext:
    """Build the ErrorContext for the frame selected by get_trace.

    Args:
        * framerecords (list): frame records (as returned by ``inspect.trace()`` or
          ``inspect.stack()``), passed to get_trace.
        * exception_classname (str|None): stored as is in the context.
        * log_message (str|None): stored as is in the context.
        * log_parameters (tuple): stored as is in the context.

    Returns:
        * ErrorContext: file name, function, line number and stripped source line of
          the selected frame, plus the three values given as arguments. ``code`` is
          None when the source line is not available.
    """
    searx_frame = get_trace(framerecords)
    # code_context is None when inspect can't find the source of the frame:
    # recording an error must not raise a TypeError because of that.
    code_context = searx_frame.code_context
    code = code_context[0].strip() if code_context else None
    return ErrorContext(searx_frame.filename, searx_frame.function, searx_frame.lineno, code,
                        exception_classname, log_message, log_parameters)
def record_exception(engine_name: str, exc: Exception) -> None:
    """Record ``exc`` as an error of ``engine_name``.

    The traceback of the exception currently being handled is read with
    ``inspect.trace()``, then the resulting error context is counted with
    add_error_context.

    Args:
        * engine_name (str): name of the engine the exception is attributed to.
        * exc (Exception): the exception to record.
    """
    framerecords = inspect.trace()
    try:
        add_error_context(
            engine_name,
            get_error_context(
                framerecords,
                get_exception_classname(exc),
                None,
                # file name of the last frame record of the traceback
                get_messages(exc, framerecords[-1].filename),
            ),
        )
    finally:
        # drop the reference to the frame records
        del framerecords
def record_error(engine_name: str, log_message: str, log_parameters: typing.Optional[typing.Tuple] = None) -> None:
    """Record an error of ``engine_name`` which is not an exception.

    The error context is built from the current call stack, without the frame of
    this function.

    Args:
        * engine_name (str): name of the engine the error is attributed to.
        * log_message (str): message stored in the error context.
        * log_parameters (tuple, optional): values stored with the message.
          An empty tuple is used when it is None or empty.
    """
    # inspect.stack() starts with the current frame: skip it (index 0)
    # and reverse the order of the remaining records.
    framerecords = inspect.stack()[:0:-1]
    try:
        parameters = log_parameters if log_parameters else ()
        add_error_context(engine_name, get_error_context(framerecords, None, log_message, parameters))
    finally:
        # drop the reference to the frame records
        del framerecords

View file

@ -4,6 +4,7 @@ from threading import RLock
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
from searx import logger from searx import logger
from searx.engines import engines from searx.engines import engines
from searx.metrology.error_recorder import record_error
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
@ -161,6 +162,7 @@ class ResultContainer:
def extend(self, engine_name, results): def extend(self, engine_name, results):
standard_result_count = 0 standard_result_count = 0
error_msgs = set()
for result in list(results): for result in list(results):
result['engine'] = engine_name result['engine'] = engine_name
if 'suggestion' in result: if 'suggestion' in result:
@ -177,14 +179,21 @@ class ResultContainer:
# standard result (url, title, content) # standard result (url, title, content)
if 'url' in result and not isinstance(result['url'], str): if 'url' in result and not isinstance(result['url'], str):
logger.debug('result: invalid URL: %s', str(result)) logger.debug('result: invalid URL: %s', str(result))
error_msgs.add('invalid URL')
elif 'title' in result and not isinstance(result['title'], str): elif 'title' in result and not isinstance(result['title'], str):
logger.debug('result: invalid title: %s', str(result)) logger.debug('result: invalid title: %s', str(result))
error_msgs.add('invalid title')
elif 'content' in result and not isinstance(result['content'], str): elif 'content' in result and not isinstance(result['content'], str):
logger.debug('result: invalid content: %s', str(result)) logger.debug('result: invalid content: %s', str(result))
error_msgs.add('invalid content')
else: else:
self._merge_result(result, standard_result_count + 1) self._merge_result(result, standard_result_count + 1)
standard_result_count += 1 standard_result_count += 1
if len(error_msgs) > 0:
for msg in error_msgs:
record_error(engine_name, 'some results are invalids: ' + msg)
if engine_name in engines: if engine_name in engines:
with RLock(): with RLock():
engines[engine_name].stats['search_count'] += 1 engines[engine_name].stats['search_count'] += 1

View file

@ -20,6 +20,7 @@ import gc
import threading import threading
from time import time from time import time
from uuid import uuid4 from uuid import uuid4
from urllib.parse import urlparse
from _thread import start_new_thread from _thread import start_new_thread
import requests.exceptions import requests.exceptions
@ -31,6 +32,8 @@ from searx.utils import gen_useragent
from searx.results import ResultContainer from searx.results import ResultContainer
from searx import logger from searx import logger
from searx.plugins import plugins from searx.plugins import plugins
from searx.exceptions import SearxEngineCaptchaException
from searx.metrology.error_recorder import record_exception, record_error
logger = logger.getChild('search') logger = logger.getChild('search')
@ -120,6 +123,14 @@ def send_http_request(engine, request_params):
if hasattr(engine, 'proxies'): if hasattr(engine, 'proxies'):
request_args['proxies'] = requests_lib.get_proxies(engine.proxies) request_args['proxies'] = requests_lib.get_proxies(engine.proxies)
# max_redirects
max_redirects = request_params.get('max_redirects')
if max_redirects:
request_args['max_redirects'] = max_redirects
# soft_max_redirects
soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0)
# specific type of request (GET or POST) # specific type of request (GET or POST)
if request_params['method'] == 'GET': if request_params['method'] == 'GET':
req = requests_lib.get req = requests_lib.get
@ -129,7 +140,23 @@ def send_http_request(engine, request_params):
request_args['data'] = request_params['data'] request_args['data'] = request_params['data']
# send the request # send the request
return req(request_params['url'], **request_args) response = req(request_params['url'], **request_args)
# check HTTP status
response.raise_for_status()
# check soft limit of the redirect count
if len(response.history) > soft_max_redirects:
# unexpected redirect : record an error
# but the engine might still return valid results.
status_code = str(response.status_code or '')
reason = response.reason or ''
hostname = str(urlparse(response.url or '').netloc)
record_error(engine.name,
'{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
(status_code, reason, hostname))
return response
def search_one_http_request(engine, query, request_params): def search_one_http_request(engine, query, request_params):
@ -183,8 +210,9 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
# update stats with the total HTTP time # update stats with the total HTTP time
engine.stats['page_load_time'] += page_load_time engine.stats['page_load_time'] += page_load_time
engine.stats['page_load_count'] += 1 engine.stats['page_load_count'] += 1
except Exception as e: except Exception as e:
record_exception(engine_name, e)
# Timing # Timing
engine_time = time() - start_time engine_time = time() - start_time
page_load_time = requests_lib.get_time_for_thread() page_load_time = requests_lib.get_time_for_thread()
@ -195,23 +223,29 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
engine.stats['errors'] += 1 engine.stats['errors'] += 1
if (issubclass(e.__class__, requests.exceptions.Timeout)): if (issubclass(e.__class__, requests.exceptions.Timeout)):
result_container.add_unresponsive_engine(engine_name, 'timeout') result_container.add_unresponsive_engine(engine_name, 'HTTP timeout')
# requests timeout (connect or read) # requests timeout (connect or read)
logger.error("engine {0} : HTTP requests timeout" logger.error("engine {0} : HTTP requests timeout"
"(search duration : {1} s, timeout: {2} s) : {3}" "(search duration : {1} s, timeout: {2} s) : {3}"
.format(engine_name, engine_time, timeout_limit, e.__class__.__name__)) .format(engine_name, engine_time, timeout_limit, e.__class__.__name__))
requests_exception = True requests_exception = True
elif (issubclass(e.__class__, requests.exceptions.RequestException)): elif (issubclass(e.__class__, requests.exceptions.RequestException)):
result_container.add_unresponsive_engine(engine_name, 'request exception') result_container.add_unresponsive_engine(engine_name, 'HTTP error')
# other requests exception # other requests exception
logger.exception("engine {0} : requests exception" logger.exception("engine {0} : requests exception"
"(search duration : {1} s, timeout: {2} s) : {3}" "(search duration : {1} s, timeout: {2} s) : {3}"
.format(engine_name, engine_time, timeout_limit, e)) .format(engine_name, engine_time, timeout_limit, e))
requests_exception = True requests_exception = True
elif (issubclass(e.__class__, SearxEngineCaptchaException)):
result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
logger.exception('engine {0} : CAPTCHA')
else: else:
result_container.add_unresponsive_engine(engine_name, 'unexpected crash', str(e)) result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
# others errors # others errors
logger.exception('engine {0} : exception : {1}'.format(engine_name, e)) logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
else:
if getattr(threading.current_thread(), '_timeout', False):
record_error(engine_name, 'Timeout')
# suspend or not the engine if there are HTTP errors # suspend or not the engine if there are HTTP errors
with threading.RLock(): with threading.RLock():
@ -255,12 +289,17 @@ def search_one_offline_request_safe(engine_name, query, request_params, result_c
engine.stats['engine_time_count'] += 1 engine.stats['engine_time_count'] += 1
except ValueError as e: except ValueError as e:
record_exception(engine_name, e)
record_offline_engine_stats_on_error(engine, result_container, start_time) record_offline_engine_stats_on_error(engine, result_container, start_time)
logger.exception('engine {0} : invalid input : {1}'.format(engine_name, e)) logger.exception('engine {0} : invalid input : {1}'.format(engine_name, e))
except Exception as e: except Exception as e:
record_exception(engine_name, e)
record_offline_engine_stats_on_error(engine, result_container, start_time) record_offline_engine_stats_on_error(engine, result_container, start_time)
result_container.add_unresponsive_engine(engine_name, 'unexpected crash', str(e)) result_container.add_unresponsive_engine(engine_name, 'unexpected crash', str(e))
logger.exception('engine {0} : exception : {1}'.format(engine_name, e)) logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
else:
if getattr(threading.current_thread(), '_timeout', False):
record_error(engine_name, 'Timeout')
def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit): def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
@ -278,6 +317,7 @@ def search_multiple_requests(requests, result_container, start_time, timeout_lim
args=(engine_name, query, request_params, result_container, start_time, timeout_limit), args=(engine_name, query, request_params, result_container, start_time, timeout_limit),
name=search_id, name=search_id,
) )
th._timeout = False
th._engine_name = engine_name th._engine_name = engine_name
th.start() th.start()
@ -286,6 +326,7 @@ def search_multiple_requests(requests, result_container, start_time, timeout_lim
remaining_time = max(0.0, timeout_limit - (time() - start_time)) remaining_time = max(0.0, timeout_limit - (time() - start_time))
th.join(remaining_time) th.join(remaining_time)
if th.is_alive(): if th.is_alive():
th._timeout = True
result_container.add_unresponsive_engine(th._engine_name, 'timeout') result_container.add_unresponsive_engine(th._engine_name, 'timeout')
logger.warning('engine timeout: {0}'.format(th._engine_name)) logger.warning('engine timeout: {0}'.format(th._engine_name))
@ -385,6 +426,9 @@ class Search:
request_params['category'] = engineref.category request_params['category'] = engineref.category
request_params['pageno'] = self.search_query.pageno request_params['pageno'] = self.search_query.pageno
with threading.RLock():
engine.stats['sent_search_count'] += 1
return request_params, engine.timeout return request_params, engine.timeout
# do search-request # do search-request

View file

@ -10,7 +10,7 @@ from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse from urllib.parse import urljoin, urlparse
from lxml import html from lxml import html
from lxml.etree import XPath, _ElementStringResult, _ElementUnicodeResult from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
from babel.core import get_global from babel.core import get_global
@ -18,6 +18,7 @@ from searx import settings
from searx.data import USER_AGENTS from searx.data import USER_AGENTS
from searx.version import VERSION_STRING from searx.version import VERSION_STRING
from searx.languages import language_codes from searx.languages import language_codes
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import logger from searx import logger
@ -33,6 +34,13 @@ xpath_cache = dict()
lang_to_lc_cache = dict() lang_to_lc_cache = dict()
class NotSetClass:
pass
NOTSET = NotSetClass()
def searx_useragent(): def searx_useragent():
"""Return the searx User Agent""" """Return the searx User Agent"""
return 'searx/{searx_version} {suffix}'.format( return 'searx/{searx_version} {suffix}'.format(
@ -125,7 +133,7 @@ def html_to_text(html_str):
return s.get_text() return s.get_text()
def extract_text(xpath_results): def extract_text(xpath_results, allow_none=False):
"""Extract text from a lxml result """Extract text from a lxml result
* if xpath_results is list, extract the text from each result and concat the list * if xpath_results is list, extract the text from each result and concat the list
@ -133,22 +141,27 @@ def extract_text(xpath_results):
( text_content() method from lxml ) ( text_content() method from lxml )
* if xpath_results is a string element, then it's already done * if xpath_results is a string element, then it's already done
""" """
if type(xpath_results) == list: if isinstance(xpath_results, list):
# it's list of result : concat everything using recursive call # it's list of result : concat everything using recursive call
result = '' result = ''
for e in xpath_results: for e in xpath_results:
result = result + extract_text(e) result = result + extract_text(e)
return result.strip() return result.strip()
elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]: elif isinstance(xpath_results, ElementBase):
# it's a string
return ''.join(xpath_results)
else:
# it's a element # it's a element
text = html.tostring( text = html.tostring(
xpath_results, encoding='unicode', method='text', with_tail=False xpath_results, encoding='unicode', method='text', with_tail=False
) )
text = text.strip().replace('\n', ' ') text = text.strip().replace('\n', ' ')
return ' '.join(text.split()) return ' '.join(text.split())
elif isinstance(xpath_results, (_ElementStringResult, _ElementUnicodeResult, str, Number, bool)):
return str(xpath_results)
elif xpath_results is None and allow_none:
return None
elif xpath_results is None and not allow_none:
raise ValueError('extract_text(None, allow_none=False)')
else:
raise ValueError('unsupported type')
def normalize_url(url, base_url): def normalize_url(url, base_url):
@ -170,7 +183,7 @@ def normalize_url(url, base_url):
>>> normalize_url('', 'https://example.com') >>> normalize_url('', 'https://example.com')
'https://example.com/' 'https://example.com/'
>>> normalize_url('/test', '/path') >>> normalize_url('/test', '/path')
raise Exception raise ValueError
Raises: Raises:
* lxml.etree.ParserError * lxml.etree.ParserError
@ -194,7 +207,7 @@ def normalize_url(url, base_url):
# add a / at this end of the url if there is no path # add a / at this end of the url if there is no path
if not parsed_url.netloc: if not parsed_url.netloc:
raise Exception('Cannot parse url') raise ValueError('Cannot parse url')
if not parsed_url.path: if not parsed_url.path:
url += '/' url += '/'
@ -224,17 +237,17 @@ def extract_url(xpath_results, base_url):
>>> f('', 'https://example.com') >>> f('', 'https://example.com')
raise lxml.etree.ParserError raise lxml.etree.ParserError
>>> searx.utils.extract_url([], 'https://example.com') >>> searx.utils.extract_url([], 'https://example.com')
raise Exception raise ValueError
Raises: Raises:
* Exception * ValueError
* lxml.etree.ParserError * lxml.etree.ParserError
Returns: Returns:
* str: normalized URL * str: normalized URL
""" """
if xpath_results == []: if xpath_results == []:
raise Exception('Empty url resultset') raise ValueError('Empty url resultset')
url = extract_text(xpath_results) url = extract_text(xpath_results)
return normalize_url(url, base_url) return normalize_url(url, base_url)
@ -256,25 +269,6 @@ def dict_subset(d, properties):
return result return result
def list_get(a_list, index, default=None):
"""Get element in list or default value
Examples:
>>> list_get(['A', 'B', 'C'], 0)
'A'
>>> list_get(['A', 'B', 'C'], 3)
None
>>> list_get(['A', 'B', 'C'], 3, 'default')
'default'
>>> list_get(['A', 'B', 'C'], -1)
'C'
"""
if len(a_list) > index:
return a_list[index]
else:
return default
def get_torrent_size(filesize, filesize_multiplier): def get_torrent_size(filesize, filesize_multiplier):
""" """
@ -310,7 +304,7 @@ def get_torrent_size(filesize, filesize_multiplier):
filesize = int(filesize * 1000 * 1000) filesize = int(filesize * 1000 * 1000)
elif filesize_multiplier == 'KiB': elif filesize_multiplier == 'KiB':
filesize = int(filesize * 1000) filesize = int(filesize * 1000)
except: except ValueError:
filesize = None filesize = None
return filesize return filesize
@ -506,20 +500,110 @@ def get_engine_from_settings(name):
return {} return {}
def get_xpath(xpath_str): def get_xpath(xpath_spec):
"""Return cached compiled XPath """Return cached compiled XPath
There is no thread lock. There is no thread lock.
Worst case scenario, xpath_str is compiled more than one time. Worst case scenario, xpath_str is compiled more than one time.
Args:
* xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
Returns:
* result (bool, float, list, str): Results.
Raises:
* TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
* SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
""" """
result = xpath_cache.get(xpath_str, None) if isinstance(xpath_spec, str):
result = xpath_cache.get(xpath_spec, None)
if result is None: if result is None:
result = XPath(xpath_str) try:
xpath_cache[xpath_str] = result result = XPath(xpath_spec)
except XPathSyntaxError as e:
raise SearxXPathSyntaxException(xpath_spec, str(e.msg))
xpath_cache[xpath_spec] = result
return result
if isinstance(xpath_spec, XPath):
return xpath_spec
raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
def eval_xpath(element, xpath_spec):
"""Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
See https://lxml.de/xpathxslt.html#xpath-return-values
Args:
* element (ElementBase): [description]
* xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
Returns:
* result (bool, float, list, str): Results.
Raises:
* TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
* SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
* SearxEngineXPathException: Raise when the XPath can't be evaluated.
"""
xpath = get_xpath(xpath_spec)
try:
return xpath(element)
except XPathError as e:
arg = ' '.join([str(i) for i in e.args])
raise SearxEngineXPathException(xpath_spec, arg)
def eval_xpath_list(element, xpath_spec, min_len=None):
"""Same as eval_xpath, check if the result is a list
Args:
* element (ElementBase): [description]
* xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
* min_len (int, optional): [description]. Defaults to None.
Raises:
* TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
* SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
* SearxEngineXPathException: raise if the result is not a list
Returns:
* result (bool, float, list, str): Results.
"""
result = eval_xpath(element, xpath_spec)
if not isinstance(result, list):
raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
if min_len is not None and min_len > len(result):
raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
return result return result
def eval_xpath(element, xpath_str): def eval_xpath_getindex(elements, xpath_spec, index, default=NOTSET):
"""Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.""" """Call eval_xpath_list then get one element using the index parameter.
xpath = get_xpath(xpath_str) If the index does not exist, either aise an exception is default is not set,
return xpath(element) other return the default value (can be None).
Args:
* elements (ElementBase): lxml element to apply the xpath.
* xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
* index (int): index to get
* default (Object, optional): Defaults if index doesn't exist.
Raises:
* TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
* SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
* SearxEngineXPathException: if the index is not found. Also see eval_xpath.
Returns:
* result (bool, float, list, str): Results.
"""
result = eval_xpath_list(elements, xpath_spec)
if index >= -len(result) and index < len(result):
return result[index]
if default == NOTSET:
# raise an SearxEngineXPathException instead of IndexError
# to record xpath_spec
raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
return default

View file

@ -79,6 +79,7 @@ from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
from searx.answerers import answerers from searx.answerers import answerers
from searx.poolrequests import get_global_proxies from searx.poolrequests import get_global_proxies
from searx.metrology.error_recorder import errors_per_engines
# serve pages with HTTP/1.1 # serve pages with HTTP/1.1
@ -943,6 +944,34 @@ def stats():
) )
@app.route('/stats/errors', methods=['GET'])
def stats_errors():
result = {}
engine_names = list(errors_per_engines.keys())
engine_names.sort()
for engine_name in engine_names:
error_stats = errors_per_engines[engine_name]
sent_search_count = max(engines[engine_name].stats['sent_search_count'], 1)
sorted_context_count_list = sorted(error_stats.items(), key=lambda context_count: context_count[1])
r = []
percentage_sum = 0
for context, count in sorted_context_count_list:
percentage = round(20 * count / sent_search_count) * 5
percentage_sum += percentage
r.append({
'filename': context.filename,
'function': context.function,
'line_no': context.line_no,
'code': context.code,
'exception_classname': context.exception_classname,
'log_message': context.log_message,
'log_parameters': context.log_parameters,
'percentage': percentage,
})
result[engine_name] = sorted(r, reverse=True, key=lambda d: d['percentage'])
return jsonify(result)
@app.route('/robots.txt', methods=['GET']) @app.route('/robots.txt', methods=['GET'])
def robots(): def robots():
return Response("""User-agent: * return Response("""User-agent: *

View file

@ -3,6 +3,7 @@ import lxml.etree
from lxml import html from lxml import html
from searx.testing import SearxTestCase from searx.testing import SearxTestCase
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import utils from searx import utils
@ -57,8 +58,16 @@ class TestUtils(SearxTestCase):
dom = html.fromstring(html_str) dom = html.fromstring(html_str)
self.assertEqual(utils.extract_text(dom), 'Test text') self.assertEqual(utils.extract_text(dom), 'Test text')
self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text') self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
self.assertEqual(utils.extract_text(dom.xpath('//span/text()')), 'Test text')
self.assertEqual(utils.extract_text(dom.xpath('count(//span)')), '3.0')
self.assertEqual(utils.extract_text(dom.xpath('boolean(//span)')), 'True')
self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg') self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '') self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
self.assertEqual(utils.extract_text(None, allow_none=True), None)
with self.assertRaises(ValueError):
utils.extract_text(None)
with self.assertRaises(ValueError):
utils.extract_text({})
def test_extract_url(self): def test_extract_url(self):
def f(html_str, search_url): def f(html_str, search_url):
@ -136,3 +145,84 @@ class TestHTMLTextExtractor(SearxTestCase):
text = '<p><b>Lorem ipsum</i>dolor sit amet</p>' text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
with self.assertRaises(utils.HTMLTextExtractorException): with self.assertRaises(utils.HTMLTextExtractorException):
self.html_text_extractor.feed(text) self.html_text_extractor.feed(text)
class TestXPathUtils(SearxTestCase):
TEST_DOC = """<ul>
<li>Text in <b>bold</b> and <i>italic</i> </li>
<li>Another <b>text</b> <img src="data:image/gif;base64,R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs="></li>
</ul>"""
def test_get_xpath_cache(self):
xp1 = utils.get_xpath('//a')
xp2 = utils.get_xpath('//div')
xp3 = utils.get_xpath('//a')
self.assertEqual(id(xp1), id(xp3))
self.assertNotEqual(id(xp1), id(xp2))
def test_get_xpath_type(self):
utils.get_xpath(lxml.etree.XPath('//a'))
with self.assertRaises(TypeError):
utils.get_xpath([])
def test_get_xpath_invalid(self):
invalid_xpath = '//a[0].text'
with self.assertRaises(SearxXPathSyntaxException) as context:
utils.get_xpath(invalid_xpath)
self.assertEqual(context.exception.message, 'Invalid expression')
self.assertEqual(context.exception.xpath_str, invalid_xpath)
def test_eval_xpath_unregistered_function(self):
doc = html.fromstring(TestXPathUtils.TEST_DOC)
invalid_function_xpath = 'int(//a)'
with self.assertRaises(SearxEngineXPathException) as context:
utils.eval_xpath(doc, invalid_function_xpath)
self.assertEqual(context.exception.message, 'Unregistered function')
self.assertEqual(context.exception.xpath_str, invalid_function_xpath)
def test_eval_xpath(self):
doc = html.fromstring(TestXPathUtils.TEST_DOC)
self.assertEqual(utils.eval_xpath(doc, '//p'), [])
self.assertEqual(utils.eval_xpath(doc, '//i/text()'), ['italic'])
self.assertEqual(utils.eval_xpath(doc, 'count(//i)'), 1.0)
def test_eval_xpath_list(self):
doc = html.fromstring(TestXPathUtils.TEST_DOC)
# check a not empty list
self.assertEqual(utils.eval_xpath_list(doc, '//i/text()'), ['italic'])
# check min_len parameter
with self.assertRaises(SearxEngineXPathException) as context:
utils.eval_xpath_list(doc, '//p', min_len=1)
self.assertEqual(context.exception.message, 'len(xpath_str) < 1')
self.assertEqual(context.exception.xpath_str, '//p')
def test_eval_xpath_getindex(self):
doc = html.fromstring(TestXPathUtils.TEST_DOC)
# check index 0
self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 0), 'italic')
# default is 'something'
self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 1, default='something'), 'something')
# default is None
self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 1, default=None), None)
# index not found
with self.assertRaises(SearxEngineXPathException) as context:
utils.eval_xpath_getindex(doc, '//i/text()', 1)
self.assertEqual(context.exception.message, 'index 1 not found')
# not a list
with self.assertRaises(SearxEngineXPathException) as context:
utils.eval_xpath_getindex(doc, 'count(//i)', 1)
self.assertEqual(context.exception.message, 'the result is not a list')