searxng/searx/engines/json_engine.py

174 lines
4.5 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""The JSON engine is a *generic* engine with which it is possible to configure
engines in the settings.
.. todo::
- The JSON engine needs documentation!!
- The parameters of the JSON engine should be adapted to those of the XPath
engine.
"""
from collections.abc import Iterable
from json import loads
from urllib.parse import urlencode
from searx.utils import to_string, html_to_text
# parameters for generating a request
# URL template; may contain {query} and {pageno} placeholders (see request())
search_url = None
# HTTP method to use ('GET' by default)
method = 'GET'
# optional body template for POST-style requests; when set, {query} is
# substituted *un*-url-encoded (see request())
request_body = ''
# extra cookies sent with every request
cookies = {}
# extra headers sent with every request
headers = {}
'''Some engines might offer different result based on cookies or headers.
Possible use-case: To set safesearch cookie or header to moderate.'''
# whether the engine supports paging
paging = False
# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
# parameters for parsing the response
# slash-separated path to the list of results inside the JSON response;
# empty string means the response itself is iterated as the result list
results_query = ''
# slash-separated path to a result's URL (mandatory per result)
url_query = None
# prefix prepended to every result URL
url_prefix = ""
# slash-separated path to a result's title (mandatory per result)
title_query = None
# slash-separated path to a result's content/description (optional)
content_query = None
# slash-separated path to a list of suggestions; empty string disables them
suggestion_query = ''
# strip HTML tags from title / content values before returning them
title_html_to_text = False
content_html_to_text = False
def iterate(iterable):
    """Yield ``(key, value)`` pairs from *iterable*.

    Dicts yield their (stringified) keys and values; every other iterable
    yields stringified positional indices and values.
    """
    if isinstance(iterable, dict):
        for key, value in iterable.items():
            yield str(key), value
    else:
        for position, value in enumerate(iterable):
            yield str(position), value
def is_iterable(obj):
    """Return ``True`` for any iterable except plain strings.

    Strings are deliberately treated as scalars so the query walker does
    not descend into individual characters.
    """
    return isinstance(obj, Iterable) and not isinstance(obj, str)
def parse(query):  # pylint: disable=redefined-outer-name
    """Split a slash-separated query path into its non-empty segments."""
    return [segment for segment in query.split('/') if segment]
def do_query(data, q):  # pylint: disable=invalid-name
    """Recursively collect every value in *data* matching the key path *q*.

    *q* is a list of key segments (see ``parse``).  A segment is consumed
    only when it matches the current key; otherwise the search continues
    deeper with the path unchanged, so matches may occur at any depth.
    """
    matches = []
    if not q:
        return matches
    target, remainder = q[0], q[1:]
    for key, value in iterate(data):
        if not remainder:
            # last segment: collect direct hits, keep searching containers
            if key == target:
                matches.append(value)
            elif is_iterable(value):
                matches.extend(do_query(value, q))
        elif is_iterable(value):
            # interior segment: descend, consuming it only on a key match
            next_path = remainder if key == target else q
            matches.extend(do_query(value, next_path))
    return matches
def query(data, query_string):
    """Evaluate the slash-separated *query_string* against *data*.

    Returns the list of all matching values (possibly empty).
    """
    return do_query(data, parse(query_string))
def request(query, params):  # pylint: disable=redefined-outer-name
    """Build the outgoing request from the engine settings.

    Fills ``params['url']``, ``params['method']``, cookies, headers and —
    for engines with a ``request_body`` — ``params['data']``.  Returns the
    mutated *params* dict.
    """
    # urlencode({'q': query}) yields 'q=<encoded>'; strip the leading 'q='
    fp = {'query': urlencode({'q': query})[2:]}  # pylint: disable=invalid-name

    # BUGFIX: the page number must also be substituted when {pageno} appears
    # only in the request body; previously request_body.format(**fp) raised
    # KeyError in that case.  Extra keys in fp are ignored by str.format, so
    # this stays backward-compatible for URL-only paging.
    if paging and ('{pageno}' in search_url or '{pageno}' in request_body):
        fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num

    params['cookies'].update(cookies)
    params['headers'].update(headers)

    params['url'] = search_url.format(**fp)
    params['method'] = method

    if request_body:
        # don't url-encode the query if it's in the request body
        fp['query'] = query
        params['data'] = request_body.format(**fp)

    return params
def identity(arg):
    """Return *arg* unchanged (no-op filter when HTML stripping is off)."""
    return arg
def response(resp):
    """Parse the engine's JSON response into searx result dicts.

    Returns a list of ``{'url', 'title', 'content'}`` dicts, optionally
    followed by ``{'suggestion': ...}`` entries when ``suggestion_query``
    is configured.
    """
    results = []
    json = loads(resp.text)

    title_filter = html_to_text if title_html_to_text else identity
    content_filter = html_to_text if content_html_to_text else identity

    # locate the iterable of raw results
    if results_query:
        rs = query(json, results_query)  # pylint: disable=invalid-name
        if not rs:
            return results
        raw_results = rs[0]
    else:
        raw_results = json

    # CONSISTENCY/ROBUSTNESS FIX: the no-results_query branch previously had
    # no guards, so one result missing url/title/content raised IndexError
    # and discarded the whole response.  Both branches now share the same
    # guarded extraction: url/title are mandatory (skip the result if
    # absent), content falls back to "".
    for result in raw_results:
        try:
            url = query(result, url_query)[0]
            title = query(result, title_query)[0]
        except:  # pylint: disable=bare-except
            continue
        try:
            content = query(result, content_query)[0]
        except:  # pylint: disable=bare-except
            content = ""
        results.append(
            {
                'url': url_prefix + to_string(url),
                'title': title_filter(to_string(title)),
                'content': content_filter(to_string(content)),
            }
        )

    if not suggestion_query:
        return results
    for suggestion in query(json, suggestion_query):
        results.append({'suggestion': suggestion})
    return results