# searxng/searx/engines/json_engine.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""The JSON engine is a *generic* engine with which it is possible to configure
engines in the settings.

.. todo::

   - The JSON engine needs documentation!!

   - The parameters of the JSON engine should be adapted to those of the XPath
     engine.

"""

from collections.abc import Iterable
from json import loads
from urllib.parse import urlencode
from searx.utils import to_string, html_to_text


# URL template of the upstream API.  request() fills it with ``str.format``,
# always supplying ``{query}`` and, when paging applies, ``{pageno}``.
search_url = None
# Slash-separated key path (see parse()) that locates the URL inside a result.
url_query = None
# String prepended to every extracted result URL.
url_prefix = ""
# Slash-separated key path that locates the content inside a result.
content_query = None
# Slash-separated key path that locates the title inside a result.
title_query = None
# When true, the extracted content is passed through html_to_text().
content_html_to_text = False
# When true, the extracted title is passed through html_to_text().
title_html_to_text = False
# When true and search_url contains ``{pageno}``, request() computes a page
# value from page_size and first_page_num.
paging = False
# Key path evaluated against the whole JSON response to collect suggestions;
# an empty value disables suggestions.
suggestion_query = ''
# Key path evaluated against the whole JSON response to find the result list;
# when empty, the top-level JSON value itself is iterated as the result list.
results_query = ''

# Merged by request() into params['cookies'] and params['headers'].
cookies = {}
headers = {}
'''Some engines might offer different result based on cookies or headers.
Possible use-case: To set safesearch cookie or header to moderate.'''

# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1


def iterate(iterable):
    """Yield ``(key, value)`` pairs of a container, with the key as a string.

    For a dict the keys are its own keys; for any other iterable the keys are
    the positional indexes.  In both cases the key is converted with ``str``
    so it can be compared against a path component.
    """
    pairs = iterable.items() if isinstance(iterable, dict) else enumerate(iterable)
    for key, item in pairs:
        yield str(key), item


def is_iterable(obj):
    """Return ``True`` for any iterable object except strings."""
    return not isinstance(obj, str) and isinstance(obj, Iterable)


def parse(query):  # pylint: disable=redefined-outer-name
    """Split a ``/``-separated query string into its non-empty components."""
    return [component for component in query.split('/') if component != '']


def do_query(data, q):  # pylint: disable=invalid-name
    """Collect every value in ``data`` reachable through the key path ``q``.

    ``q`` is a list of path components.  The first component is searched at
    any nesting depth: entries whose key does not match are still descended
    into with the unchanged path.  Once a key matches, the search continues
    inside its value with the remaining components; a match on the last
    component collects the value itself.

    Returns a list of the matched values (empty when ``q`` is empty).
    """
    if not q:
        return []

    head, tail = q[0], q[1:]
    matches = []
    for key, value in iterate(data):
        hit = key == head
        if hit and not tail:
            # Last path component matched: take the value, do not descend.
            matches.append(value)
            continue
        if not is_iterable(value):
            continue
        # Consume one component on a key match, otherwise keep searching
        # deeper with the same path.
        matches += do_query(value, tail if hit else q)
    return matches


def query(data, query_string):
    """Evaluate the ``/``-separated ``query_string`` against ``data``.

    Returns the list of matched values produced by :py:func:`do_query`.
    """
    return do_query(data, parse(query_string))


def request(query, params):  # pylint: disable=redefined-outer-name
    """Fill ``params`` with the URL, cookies and headers of the request.

    The search term is URL-encoded and substituted for ``{query}`` in
    ``search_url``.  When paging is enabled and the template contains
    ``{pageno}``, the page value is derived from ``params['pageno']``,
    ``page_size`` and ``first_page_num``.  Returns the updated ``params``.
    """
    # urlencode() yields "q=<encoded>"; drop the leading "q=".
    encoded = urlencode({'q': query})[2:]

    template_args = {'query': encoded}  # pylint: disable=invalid-name
    if paging and '{pageno}' in search_url:
        offset = (params['pageno'] - 1) * page_size
        template_args['pageno'] = offset + first_page_num

    params['cookies'].update(cookies)
    params['headers'].update(headers)

    params['url'] = search_url.format(**template_args)
    params['query'] = encoded

    return params


def identity(arg):
    """Return ``arg`` unchanged (no-op stand-in for a text filter)."""
    return arg


def _extract_result(result, title_filter, content_filter):
    """Build one result dict from a single JSON result item.

    Returns ``None`` when the URL or the title cannot be extracted, so the
    caller can skip the item.  A content that cannot be extracted is replaced
    by an empty string.
    """
    try:
        url = query(result, url_query)[0]
        title = query(result, title_query)[0]
    except Exception:  # pylint: disable=broad-except
        # Best effort: an item without URL or title is unusable, skip it.
        return None

    try:
        content = query(result, content_query)[0]
    except Exception:  # pylint: disable=broad-except
        # Content is optional: fall back to an empty string.
        content = ""

    return {
        'url': url_prefix + to_string(url),
        'title': title_filter(to_string(title)),
        'content': content_filter(to_string(content)),
    }


def response(resp):
    """Convert the JSON body of ``resp`` into a list of result dicts.

    ``resp.text`` is parsed as JSON.  When ``results_query`` is set, the first
    match of that query is used as the result list (no match returns an empty
    list); otherwise the top-level JSON value is iterated.  Each item yields a
    dict with the keys ``url``, ``title`` and ``content``; items without URL
    or title are skipped in both modes.  When ``suggestion_query`` is set, one
    ``{'suggestion': ...}`` dict per match is appended.
    """
    data = loads(resp.text)

    title_filter = html_to_text if title_html_to_text else identity
    content_filter = html_to_text if content_html_to_text else identity

    if results_query:
        matches = query(data, results_query)
        if not matches:
            return []
        # Only the first match of results_query is used as the result list.
        raw_results = matches[0]
    else:
        raw_results = data

    results = []
    for raw in raw_results:
        item = _extract_result(raw, title_filter, content_filter)
        if item is not None:
            results.append(item)

    if suggestion_query:
        results.extend({'suggestion': suggestion} for suggestion in query(data, suggestion_query))
    return results
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 10:31:25 +00:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`"""The JSON engine is a generic engine with which it is possible to configure`
			`engines in the settings.`

			`.. todo::`

			`- The JSON engine needs documentation!!`

			`- The parameters of the JSON engine should be adapted to those of the XPath`
			`engine.`

			`"""`
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 10:31:25 +00:00
[fix] drop Python 2: use collections.abc.Iterable instead of collections.Iterable 2020-10-05 10:52:08 +00:00			`from collections.abc import Iterable`
[enh] py3 compatibility 2016-11-30 17:43:03 +00:00			`from json import loads`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 15:42:46 +00:00			`from urllib.parse import urlencode`
[mod] json_engine: add content_html_to_text and title_html_to_text Some JSON API returns HTML in either in the HTML or the content. This commit adds two new parameters to the json_engine: content_html_to_text and title_html_to_text, False by default. If True, then the searx.utils.html_to_text removes the HTML tags. Update crossref, openairedatasets and openairepublications engines 2021-02-10 15:40:03 +00:00			`from searx.utils import to_string, html_to_text`
[enh] py3 compatibility 2016-11-30 17:43:03 +00:00
[enh] json engine added 2013-11-19 14:49:52 +00:00
[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00			`search_url = None`
			`url_query = None`
[feat] engine: implementation of mdn 2023-10-20 09:26:28 +00:00			`url_prefix = ""`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`content_query = None`
[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00			`title_query = None`
[mod] json_engine: add content_html_to_text and title_html_to_text Some JSON API returns HTML in either in the HTML or the content. This commit adds two new parameters to the json_engine: content_html_to_text and title_html_to_text, False by default. If True, then the searx.utils.html_to_text removes the HTML tags. Update crossref, openairedatasets and openairepublications engines 2021-02-10 15:40:03 +00:00			`content_html_to_text = False`
			`title_html_to_text = False`
[enh] py3 compatibility 2016-11-30 17:43:03 +00:00			`paging = False`
Add Microsoft Academic search engine 2016-08-12 10:15:27 +00:00			`suggestion_query = ''`
			`results_query = ''`
[enh] json engine added 2013-11-19 14:49:52 +00:00
[enh] Initial no paging support for Yep.com Upstream example query: https://yep.com/web?q=test https://yep.com/about 2022-06-10 21:26:55 +00:00			`cookies = {}`
			`headers = {}`
			`'''Some engines might offer different result based on cookies or headers.`
			`Possible use-case: To set safesearch cookie or header to moderate.'''`

Add Crossref search engine and DOAI rewrite plugin 2016-07-16 09:26:29 +00:00			`# parameters for engines with paging support`
			`#`
			`# number of results on each page`
			`# (only needed if the site requires not a page number, but an offset)`
			`page_size = 1`
			`# number of the first page (usually 0 or 1)`
			`first_page_num = 1`

[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00
[enh] json engine added 2013-11-19 14:49:52 +00:00			`def iterate(iterable):`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`if isinstance(iterable, dict):`
			`items = iterable.items()`
[enh] json engine added 2013-11-19 14:49:52 +00:00
			`else:`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`items = enumerate(iterable)`
			`for index, value in items:`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`yield str(index), value`

[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00
[enh] json engine added 2013-11-19 14:49:52 +00:00			`def is_iterable(obj):`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`if isinstance(obj, str):`
[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00			`return False`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`return isinstance(obj, Iterable)`

[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`def parse(query): # pylint: disable=redefined-outer-name`
			`q = [] # pylint: disable=invalid-name`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`for part in query.split('/'):`
			`if part == '':`
			`continue`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`q.append(part)`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`return q`

[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`def do_query(data, q): # pylint: disable=invalid-name`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`ret = []`
[mod] len() removed from conditions 2014-02-11 12:13:51 +00:00			`if not q:`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`return ret`

			`qkey = q[0]`

[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00			`for key, value in iterate(data):`
[enh] json engine added 2013-11-19 14:49:52 +00:00
			`if len(q) == 1:`
			`if key == qkey:`
			`ret.append(value)`
			`elif is_iterable(value):`
			`ret.extend(do_query(value, q))`
			`else:`
			`if not is_iterable(value):`
			`continue`
			`if key == qkey:`
			`ret.extend(do_query(value, q[1:]))`
			`else:`
			`ret.extend(do_query(value, q))`
			`return ret`

[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00
[enh] json engine added 2013-11-19 14:49:52 +00:00			`def query(data, query_string):`
			`q = parse(query_string)`

			`return do_query(data, q)`

[fix] pep/flake8 compatibility 2014-01-20 01:31:20 +00:00
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`def request(query, params): # pylint: disable=redefined-outer-name`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`query = urlencode({'q': query})[2:]`
Add Crossref search engine and DOAI rewrite plugin 2016-07-16 09:26:29 +00:00
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`fp = {'query': query} # pylint: disable=invalid-name`
Add Crossref search engine and DOAI rewrite plugin 2016-07-16 09:26:29 +00:00			`if paging and search_url.find('{pageno}') >= 0:`
[fix] behaviour for page_size>1 and first_page_num>0 eg. pageno=1,21,41,... instead of 20,40,60,... 2016-08-14 11:46:54 +00:00			`fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num`
Add Crossref search engine and DOAI rewrite plugin 2016-07-16 09:26:29 +00:00
[enh] Initial no paging support for Yep.com Upstream example query: https://yep.com/web?q=test https://yep.com/about 2022-06-10 21:26:55 +00:00			`params['cookies'].update(cookies)`
			`params['headers'].update(headers)`

Add Crossref search engine and DOAI rewrite plugin 2016-07-16 09:26:29 +00:00			`params['url'] = search_url.format(**fp)`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`params['query'] = query`
Add Crossref search engine and DOAI rewrite plugin 2016-07-16 09:26:29 +00:00
[enh] json engine added 2013-11-19 14:49:52 +00:00			`return params`


[mod] json_engine: add content_html_to_text and title_html_to_text Some JSON API returns HTML in either in the HTML or the content. This commit adds two new parameters to the json_engine: content_html_to_text and title_html_to_text, False by default. If True, then the searx.utils.html_to_text removes the HTML tags. Update crossref, openairedatasets and openairepublications engines 2021-02-10 15:40:03 +00:00			`def identity(arg):`
			`return arg`


[enh] json engine added 2013-11-19 14:49:52 +00:00			`def response(resp):`
			`results = []`
			`json = loads(resp.text)`
[mod] json_engine: add content_html_to_text and title_html_to_text Some JSON API returns HTML in either in the HTML or the content. This commit adds two new parameters to the json_engine: content_html_to_text and title_html_to_text, False by default. If True, then the searx.utils.html_to_text removes the HTML tags. Update crossref, openairedatasets and openairepublications engines 2021-02-10 15:40:03 +00:00
			`title_filter = html_to_text if title_html_to_text else identity`
			`content_filter = html_to_text if content_html_to_text else identity`

Add Microsoft Academic search engine 2016-08-12 10:15:27 +00:00			`if results_query:`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`rs = query(json, results_query) # pylint: disable=invalid-name`
			`if not rs:`
[mod] add more error handling to json engine 2017-11-01 23:43:29 +00:00			`return results`
			`for result in rs[0]:`
[mod] add more error handling to json engine II. 2017-11-02 00:08:15 +00:00			`try:`
			`url = query(result, url_query)[0]`
			`title = query(result, title_query)[0]`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`except: # pylint: disable=bare-except`
[mod] add more error handling to json engine II. 2017-11-02 00:08:15 +00:00			`continue`
[mod] add more error handling to json engine 2017-11-01 23:43:29 +00:00			`try:`
			`content = query(result, content_query)[0]`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 06:45:08 +00:00			`except: # pylint: disable=bare-except`
[mod] add more error handling to json engine 2017-11-01 23:43:29 +00:00			`content = ""`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 08:26:22 +00:00			`results.append(`
			`{`
[feat] engine: implementation of mdn 2023-10-20 09:26:28 +00:00			`'url': url_prefix + to_string(url),`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 08:26:22 +00:00			`'title': title_filter(to_string(title)),`
			`'content': content_filter(to_string(content)),`
			`}`
			`)`
Add Microsoft Academic search engine 2016-08-12 10:15:27 +00:00			`else:`
[fix] json_engine: Fix result fields being mixed up Fixes #3810. 2024-09-12 08:10:20 +00:00			`for result in json:`
			`url = query(result, url_query)[0]`
			`title = query(result, title_query)[0]`
			`content = query(result, content_query)[0]`

[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 08:26:22 +00:00			`results.append(`
			`{`
[feat] engine: implementation of mdn 2023-10-20 09:26:28 +00:00			`'url': url_prefix + to_string(url),`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 08:26:22 +00:00			`'title': title_filter(to_string(title)),`
			`'content': content_filter(to_string(content)),`
			`}`
			`)`
Add Microsoft Academic search engine 2016-08-12 10:15:27 +00:00
			`if not suggestion_query:`
			`return results`
			`for suggestion in query(json, suggestion_query):`
			`results.append({'suggestion': suggestion})`
[enh] json engine added 2013-11-19 14:49:52 +00:00			`return results`