From db522cf76d26e6c871ae5c012787985685e82c17 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Fri, 4 Aug 2023 16:54:22 +0200
Subject: [PATCH] [mod] engine: wikimedia - improve results, add additional settings & doc

Signed-off-by: Markus Heiser
---
 docs/dev/engines/index.rst     |   6 ++
 docs/dev/engines/mediawiki.rst |  13 +++
 searx/engines/mediawiki.py     | 181 ++++++++++++++++++++++++---------
 searx/settings.yml             |  21 +---
 4 files changed, 154 insertions(+), 67 deletions(-)
 create mode 100644 docs/dev/engines/mediawiki.rst

diff --git a/docs/dev/engines/index.rst b/docs/dev/engines/index.rst
index ca8f87b3c..daa12f202 100644
--- a/docs/dev/engines/index.rst
+++ b/docs/dev/engines/index.rst
@@ -40,6 +40,12 @@ Online Engines
    demo/demo_online
    xpath
+   mediawiki
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
    online/*
 
 .. _offline engines:
diff --git a/docs/dev/engines/mediawiki.rst b/docs/dev/engines/mediawiki.rst
new file mode 100644
index 000000000..ce708f95b
--- /dev/null
+++ b/docs/dev/engines/mediawiki.rst
@@ -0,0 +1,13 @@
+.. _mediawiki engine:
+
+================
+MediaWiki Engine
+================
+
+.. contents::
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.mediawiki
+   :members:
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
index 27ce36e87..6a9ac974a 100644
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@@ -1,18 +1,59 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- General mediawiki-engine (Web)
-"""
+# lint: pylint
+"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
+the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
+endpoints that follow this pattern::
-from string import Formatter
+
+    https://{base_url}/w/api.php?action=query&list=search&format=json
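+
+For example, with the default :py:obj:`base_url`, a search for the term
+``SearXNG`` on the English Wikipedia is sent to a URL like the one shown
+below (the parameter values are only illustrative; the full set of
+parameters is built in the ``request`` function)::
+
+    https://en.wikipedia.org/w/api.php?action=query&list=search&format=json&srsearch=SearXNG&srwhat=nearmatch&srlimit=5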
+
+.. note::
+
+   In its current state, this engine is implemented to parse JSON results
+   (`format=json`_) from a search query (`list=search`_). If you need other
+   ``action`` and ``list`` types, ask the SearXNG developers to extend the
+   implementation according to your needs.
+
+.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
+.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
+.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
+.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
+
+Configuration
+=============
+
+Request:
+
+- :py:obj:`base_url`
+- :py:obj:`search_type`
+- :py:obj:`srenablerewrites`
+- :py:obj:`srsort`
+- :py:obj:`srprop`
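+
+An engine that uses this module is configured in SearXNG's ``settings.yml``.
+The entry below is only an illustrative sketch (compare the Wikinews entry in
+the built-in ``settings.yml``)::
+
+  - name: wikinews
+    engine: mediawiki
+    shortcut: wn
+    categories: news
+    base_url: "https://{language}.wikinews.org/"
+    search_type: text
+    srsort: create_timestamp_desc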
+""" + +timestamp_format = '%Y-%m-%dT%H:%M:%SZ' +"""The longhand version of MediaWiki time strings.""" -# do search-request def request(query, params): - offset = (params['pageno'] - 1) * number_of_results - - string_args = dict( - query=urlencode({'srsearch': query}), offset=offset, limit=number_of_results, searchtype=search_type - ) - - format_strings = list(Formatter().parse(base_url)) - - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('-')[0] - - # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)] - if any(x[1] == 'language' for x in format_strings): - string_args['language'] = language # write search-language back to params, required in response - params['language'] = language - search_url = base_url + search_postfix + if params['language'] == 'all': + params['language'] = 'en' + else: + params['language'] = params['language'].split('-')[0] - params['url'] = search_url.format(**string_args) + if base_url.endswith('/'): + api_url = base_url + 'w/api.php?' + else: + api_url = base_url + '/w/api.php?' + api_url = api_url.format(language=params['language']) + offset = (params['pageno'] - 1) * number_of_results + + args = { + 'action': 'query', + 'list': 'search', + 'format': 'json', + 'srsearch': query, + 'sroffset': offset, + 'srlimit': number_of_results, + 'srwhat': search_type, + 'srprop': srprop, + 'srsort': srsort, + } + if srenablerewrites: + args['srenablerewrites'] = '1' + + params['url'] = api_url + urlencode(args) return params # get response from search-request def response(resp): - results = [] + results = [] search_results = resp.json() # return empty array if there are no results if not search_results.get('query', {}).get('search'): return [] - # parse results for result in search_results['query']['search']: + if result.get('snippet', '').startswith('#REDIRECT'): continue - url = ( - base_url.format(language=resp.search_params['language']) - + 'wiki/' - + quote(result['title'].replace(' ', '_').encode()) - ) - # append result - results.append({'url': url, 'title': result['title'], 'content': html_to_text(result.get('snippet', ''))}) + title = result['title'] + sectiontitle = result.get('sectiontitle') + content = html_to_text(result.get('snippet', '')) + metadata = html_to_text(result.get('categorysnippet', '')) + timestamp = result.get('timestamp') + + url = ( + base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode()) + ) + if sectiontitle: + # in case of sectiontitle create a link to the section in the wiki page + url += '#' + quote(sectiontitle.replace(' ', '_').encode()) + title += ' / ' + sectiontitle + + item = {'url': url, 'title': title, 'content': content, 'metadata': metadata} + + if timestamp: + item['publishedDate'] = datetime.strptime(timestamp, timestamp_format) + + results.append(item) # return results return results diff --git a/searx/settings.yml b/searx/settings.yml index e498a9ea2..bc9f5a29d 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -667,11 +667,6 @@ engines: shortcut: fsd categories: [it, software wikis] base_url: https://directory.fsf.org/ - number_of_results: 5 - # what part of a page matches the query string: title, text, nearmatch - # * title - query matches title - # * text - query matches the text of page - # * nearmatch - nearmatch in title search_type: title timeout: 5.0 disabled: true @@ -1449,13 +1444,6 @@ engines: engine: twitter disabled: true - # maybe in a fun category - # - name: 
uncyclopedia - # engine: mediawiki - # shortcut: unc - # base_url: https://uncyclopedia.wikia.com/ - # number_of_results: 5 - # tmp suspended - too slow, too many errors # - name: urbandictionary # engine : xpath @@ -1534,7 +1522,6 @@ engines: shortcut: wb categories: general base_url: "https://{language}.wikibooks.org/" - number_of_results: 5 search_type: text disabled: true about: @@ -1546,9 +1533,9 @@ engines: shortcut: wn categories: news base_url: "https://{language}.wikinews.org/" - number_of_results: 5 search_type: text disabled: true + srsort: create_timestamp_desc about: website: https://www.wikinews.org/ wikidata_id: Q964 @@ -1558,7 +1545,6 @@ engines: shortcut: wq categories: general base_url: "https://{language}.wikiquote.org/" - number_of_results: 5 search_type: text disabled: true additional_tests: @@ -1572,7 +1558,6 @@ engines: shortcut: ws categories: general base_url: "https://{language}.wikisource.org/" - number_of_results: 5 search_type: text disabled: true about: @@ -1584,7 +1569,6 @@ engines: shortcut: wsp categories: [general, science] base_url: "https://species.wikimedia.org/" - number_of_results: 5 search_type: text disabled: true about: @@ -1596,7 +1580,6 @@ engines: shortcut: wt categories: [dictionaries] base_url: "https://{language}.wiktionary.org/" - number_of_results: 5 search_type: text about: website: https://www.wiktionary.org/ @@ -1607,7 +1590,6 @@ engines: shortcut: wv categories: general base_url: "https://{language}.wikiversity.org/" - number_of_results: 5 search_type: text disabled: true about: @@ -1619,7 +1601,6 @@ engines: shortcut: wy categories: general base_url: "https://{language}.wikivoyage.org/" - number_of_results: 5 search_type: text disabled: true about: