From 591d9c2505e946a2a180e73fe3532f6335cbb955 Mon Sep 17 00:00:00 2001 From: Lucki Date: Sat, 11 Jan 2025 01:51:33 +0100 Subject: [PATCH] [json_engine] document existing options --- docs/dev/engines/index.rst | 1 + docs/dev/engines/json_engine.rst | 13 +++ searx/engines/json_engine.py | 136 ++++++++++++++++++++++++++++--- 3 files changed, 137 insertions(+), 13 deletions(-) create mode 100644 docs/dev/engines/json_engine.rst diff --git a/docs/dev/engines/index.rst b/docs/dev/engines/index.rst index 6558dc2c2..d0a31ab67 100644 --- a/docs/dev/engines/index.rst +++ b/docs/dev/engines/index.rst @@ -45,6 +45,7 @@ Online Engines demo/demo_online xpath mediawiki + json_engine .. toctree:: :maxdepth: 1 diff --git a/docs/dev/engines/json_engine.rst b/docs/dev/engines/json_engine.rst new file mode 100644 index 000000000..f1a48c6db --- /dev/null +++ b/docs/dev/engines/json_engine.rst @@ -0,0 +1,13 @@ +.. _json_engine engine: + +============ +JSON Engine +============ + +.. contents:: + :depth: 2 + :local: + :backlinks: entry + +.. automodule:: searx.engines.json_engine + :members: diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py index 942f6ae8a..a84a21462 100644 --- a/searx/engines/json_engine.py +++ b/searx/engines/json_engine.py @@ -2,12 +2,58 @@ """The JSON engine is a *generic* engine with which it is possible to configure engines in the settings. -.. todo:: +Configuration +============= - - The JSON engine needs documentation!! +Request: - - The parameters of the JSON engine should be adapted to those of the XPath - engine. +- :py:obj:`search_url` +- :py:obj:`method` +- :py:obj:`request_body` +- :py:obj:`cookies` +- :py:obj:`headers` + +Paging: + +- :py:obj:`paging` +- :py:obj:`page_size` +- :py:obj:`first_page_num` + +Response: + +- :py:obj:`title_html_to_text` +- :py:obj:`content_html_to_text` + +JSON query: + +- :py:obj:`results_query` +- :py:obj:`url_query` +- :py:obj:`url_prefix` +- :py:obj:`title_query` +- :py:obj:`content_query` +- :py:obj:`suggestion_query` + + +Example +======= + +Here is a simple example of a JSON engine configure in the :ref:`settings +engine` section, further read :ref:`engines-dev`. + +.. code:: yaml + + - name : mdn + engine : json_engine + paging : True + search_url : https://developer.mozilla.org/api/v1/search?q={query}&page={pageno} + results_query : documents + url_query : mdn_url + url_prefix : https://developer.mozilla.org + title_query : title + content_query : summary + +Implementations +=============== """ @@ -16,34 +62,92 @@ from json import loads from urllib.parse import urlencode from searx.utils import to_string, html_to_text -# parameters for generating a request search_url = None +""" +Search URL of the engine. Example:: + + https://example.org/?search={query}&page={pageno} + +Replacements are: + +``{query}``: + Search terms from user. + +``{pageno}``: + Page number if engine supports paging :py:obj:`paging` + +""" + method = 'GET' +'''Some engines might require to do POST requests for search.''' + request_body = '' +'''The body of the request. This can only be used if different :py:obj:`method` +is set, e.g. ``POST``. For formatting see the documentation of :py:obj:`search_url`. + +Note: Curly brackets which aren't encapsulating a replacement placeholder +must be escaped by doubling each ``{`` and ``}``. + +.. code:: yaml + + request_body: >- + {{ + "search": "{query}", + "page": {pageno}, + "extra": {{ + "time_range": {time_range}, + "rating": "{safe_search}" + }} + }} +''' cookies = {} +'''Some engines might offer different result based on cookies. +Possible use-case: To set safesearch cookie.''' + headers = {} '''Some engines might offer different result based on cookies or headers. Possible use-case: To set safesearch cookie or header to moderate.''' paging = False -# parameters for engines with paging support -# -# number of results on each page -# (only needed if the site requires not a page number, but an offset) -page_size = 1 -# number of the first page (usually 0 or 1) -first_page_num = 1 +'''Engine supports paging [True or False].''' + +page_size = 1 +'''Number of results on each page. Only needed if the site requires not a page +number, but an offset.''' + +first_page_num = 1 +'''Number of the first page (usually 0 or 1).''' -# parameters for parsing the response results_query = '' +'''JSON query for the list of result items. + +The query string is a slash `/` separated path of JSON key names. +Array entries can be specified using the index or can be omitted entirely, +in which case each entry is considered - +most implementations will default to the first entry in this case. +''' + url_query = None +'''JSON query of result's ``url``. For the query string documentation see :py:obj:`results_query`''' + url_prefix = "" +'''String to prepend to the result's ``url``.''' + title_query = None +'''JSON query of result's ``title``. For the query string documentation see :py:obj:`results_query`''' + content_query = None +'''JSON query of result's ``content``. For the query string documentation see :py:obj:`results_query`''' + suggestion_query = '' +'''JSON query of result's ``suggestion``. For the query string documentation see :py:obj:`results_query`''' + title_html_to_text = False +'''Extract text from a HTML title string''' + content_html_to_text = False +'''Extract text from a HTML content string''' def iterate(iterable): @@ -102,6 +206,7 @@ def query(data, query_string): def request(query, params): # pylint: disable=redefined-outer-name + '''Build request parameters (see :ref:`engine request`).''' fp = {'query': urlencode({'q': query})[2:]} # pylint: disable=invalid-name if paging and search_url.find('{pageno}') >= 0: @@ -126,7 +231,12 @@ def identity(arg): def response(resp): + '''Scrap *results* from the response (see :ref:`engine results`).''' results = [] + + if not resp.text: + return results + json = loads(resp.text) title_filter = html_to_text if title_html_to_text else identity