mirror of
https://github.com/searxng/searxng.git
synced 2024-11-26 04:41:00 +00:00
[mod] engine: wikimedia - improve results, add additional settings & doc
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
7d8c20c80d
commit
db522cf76d
4 changed files with 154 additions and 67 deletions
|
@ -40,6 +40,12 @@ Online Engines
|
||||||
|
|
||||||
demo/demo_online
|
demo/demo_online
|
||||||
xpath
|
xpath
|
||||||
|
mediawiki
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
:glob:
|
||||||
|
|
||||||
online/*
|
online/*
|
||||||
|
|
||||||
.. _offline engines:
|
.. _offline engines:
|
||||||
|
|
13
docs/dev/engines/mediawiki.rst
Normal file
13
docs/dev/engines/mediawiki.rst
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
.. _mediawiki engine:
|
||||||
|
|
||||||
|
================
|
||||||
|
MediaWiki Engine
|
||||||
|
================
|
||||||
|
|
||||||
|
.. contents::
|
||||||
|
:depth: 2
|
||||||
|
:local:
|
||||||
|
:backlinks: entry
|
||||||
|
|
||||||
|
.. automodule:: searx.engines.mediawiki
|
||||||
|
:members:
|
|
@ -1,18 +1,59 @@
|
||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
"""
|
# lint: pylint
|
||||||
General mediawiki-engine (Web)
|
"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
|
||||||
"""
|
the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
|
||||||
|
endpoints that follow this pattern::
|
||||||
|
|
||||||
from string import Formatter
|
https://{base_url}/w/api.php?action=query&list=search&format=json
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
In its current state, this engine is implemented to parse JSON results
|
||||||
|
(`format=json`_) from a search query (`list=search`_). If you need other
|
||||||
|
``action`` and ``list`` types ask SearXNG developers to extend the
|
||||||
|
implementation according to your needs.
|
||||||
|
|
||||||
|
.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
|
||||||
|
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
|
||||||
|
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
|
||||||
|
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
Request:
|
||||||
|
|
||||||
|
- :py:obj:`base_url`
|
||||||
|
- :py:obj:`search_type`
|
||||||
|
- :py:obj:`srenablerewrites`
|
||||||
|
- :py:obj:`srsort`
|
||||||
|
- :py:obj:`srprop`
|
||||||
|
|
||||||
|
Implementations
|
||||||
|
===============
|
||||||
|
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
from urllib.parse import urlencode, quote
|
from urllib.parse import urlencode, quote
|
||||||
|
|
||||||
from searx.utils import html_to_text
|
from searx.utils import html_to_text
|
||||||
|
from searx.enginelib.traits import EngineTraits
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger: logging.Logger
|
||||||
|
|
||||||
|
traits: EngineTraits
|
||||||
|
|
||||||
# about
|
# about
|
||||||
about = {
|
about = {
|
||||||
"website": None,
|
"website": None,
|
||||||
"wikidata_id": None,
|
"wikidata_id": None,
|
||||||
"official_api_documentation": 'http://www.mediawiki.org/wiki/API:Search',
|
"official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
|
||||||
"use_official_api": True,
|
"use_official_api": True,
|
||||||
"require_api_key": False,
|
"require_api_key": False,
|
||||||
"results": 'JSON',
|
"results": 'JSON',
|
||||||
|
@ -21,73 +62,119 @@ about = {
|
||||||
# engine dependent config
|
# engine dependent config
|
||||||
categories = ['general']
|
categories = ['general']
|
||||||
paging = True
|
paging = True
|
||||||
number_of_results = 1
|
number_of_results = 5
|
||||||
search_type = 'nearmatch' # possible values: title, text, nearmatch
|
|
||||||
|
|
||||||
# search-url
|
search_type: str = 'nearmatch'
|
||||||
base_url = 'https://{language}.wikipedia.org/'
|
"""Which type of search to perform. One of the following values: ``nearmatch``,
|
||||||
search_postfix = (
|
``text`` or ``title``.
|
||||||
'w/api.php?action=query'
|
|
||||||
'&list=search'
|
See ``srwhat`` argument in `list=search`_ documentation.
|
||||||
'&{query}'
|
"""
|
||||||
'&format=json'
|
|
||||||
'&sroffset={offset}'
|
srenablerewrites: bool = True
|
||||||
'&srlimit={limit}'
|
"""Enable internal query rewriting (Type: boolean). Some search backends can
|
||||||
'&srwhat={searchtype}'
|
rewrite the query into another which is thought to provide better results, for
|
||||||
)
|
instance by correcting spelling errors.
|
||||||
|
|
||||||
|
See ``srenablerewrites`` argument in `list=search`_ documentation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
srsort: str = 'relevance'
|
||||||
|
"""Set the sort order of returned results. One of the following values:
|
||||||
|
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
|
||||||
|
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
|
||||||
|
``none``, ``random``, ``relevance``, ``user_random``.
|
||||||
|
|
||||||
|
See ``srsort`` argument in `list=search`_ documentation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
|
||||||
|
"""Which properties to return.
|
||||||
|
|
||||||
|
See ``srprop`` argument in `list=search`_ documentation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
base_url: str = 'https://{language}.wikipedia.org/'
|
||||||
|
"""Base URL of the Wikimedia wiki.
|
||||||
|
|
||||||
|
``{language}``:
|
||||||
|
ISO 639-1 language code (en, de, fr ..) of the search language.
|
||||||
|
"""
|
||||||
|
|
||||||
|
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
|
||||||
|
"""The longhand version of MediaWiki time strings."""
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
offset = (params['pageno'] - 1) * number_of_results
|
|
||||||
|
|
||||||
string_args = dict(
|
|
||||||
query=urlencode({'srsearch': query}), offset=offset, limit=number_of_results, searchtype=search_type
|
|
||||||
)
|
|
||||||
|
|
||||||
format_strings = list(Formatter().parse(base_url))
|
|
||||||
|
|
||||||
if params['language'] == 'all':
|
|
||||||
language = 'en'
|
|
||||||
else:
|
|
||||||
language = params['language'].split('-')[0]
|
|
||||||
|
|
||||||
# format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
|
|
||||||
if any(x[1] == 'language' for x in format_strings):
|
|
||||||
string_args['language'] = language
|
|
||||||
|
|
||||||
# write search-language back to params, required in response
|
# write search-language back to params, required in response
|
||||||
params['language'] = language
|
|
||||||
|
|
||||||
search_url = base_url + search_postfix
|
if params['language'] == 'all':
|
||||||
|
params['language'] = 'en'
|
||||||
|
else:
|
||||||
|
params['language'] = params['language'].split('-')[0]
|
||||||
|
|
||||||
params['url'] = search_url.format(**string_args)
|
if base_url.endswith('/'):
|
||||||
|
api_url = base_url + 'w/api.php?'
|
||||||
|
else:
|
||||||
|
api_url = base_url + '/w/api.php?'
|
||||||
|
api_url = api_url.format(language=params['language'])
|
||||||
|
|
||||||
|
offset = (params['pageno'] - 1) * number_of_results
|
||||||
|
|
||||||
|
args = {
|
||||||
|
'action': 'query',
|
||||||
|
'list': 'search',
|
||||||
|
'format': 'json',
|
||||||
|
'srsearch': query,
|
||||||
|
'sroffset': offset,
|
||||||
|
'srlimit': number_of_results,
|
||||||
|
'srwhat': search_type,
|
||||||
|
'srprop': srprop,
|
||||||
|
'srsort': srsort,
|
||||||
|
}
|
||||||
|
if srenablerewrites:
|
||||||
|
args['srenablerewrites'] = '1'
|
||||||
|
|
||||||
|
params['url'] = api_url + urlencode(args)
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
# get response from search-request
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
|
||||||
|
|
||||||
|
results = []
|
||||||
search_results = resp.json()
|
search_results = resp.json()
|
||||||
|
|
||||||
# return empty array if there are no results
|
# return empty array if there are no results
|
||||||
if not search_results.get('query', {}).get('search'):
|
if not search_results.get('query', {}).get('search'):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# parse results
|
|
||||||
for result in search_results['query']['search']:
|
for result in search_results['query']['search']:
|
||||||
|
|
||||||
if result.get('snippet', '').startswith('#REDIRECT'):
|
if result.get('snippet', '').startswith('#REDIRECT'):
|
||||||
continue
|
continue
|
||||||
url = (
|
|
||||||
base_url.format(language=resp.search_params['language'])
|
|
||||||
+ 'wiki/'
|
|
||||||
+ quote(result['title'].replace(' ', '_').encode())
|
|
||||||
)
|
|
||||||
|
|
||||||
# append result
|
title = result['title']
|
||||||
results.append({'url': url, 'title': result['title'], 'content': html_to_text(result.get('snippet', ''))})
|
sectiontitle = result.get('sectiontitle')
|
||||||
|
content = html_to_text(result.get('snippet', ''))
|
||||||
|
metadata = html_to_text(result.get('categorysnippet', ''))
|
||||||
|
timestamp = result.get('timestamp')
|
||||||
|
|
||||||
|
url = (
|
||||||
|
base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
|
||||||
|
)
|
||||||
|
if sectiontitle:
|
||||||
|
# in case of sectiontitle create a link to the section in the wiki page
|
||||||
|
url += '#' + quote(sectiontitle.replace(' ', '_').encode())
|
||||||
|
title += ' / ' + sectiontitle
|
||||||
|
|
||||||
|
item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
|
||||||
|
|
||||||
|
if timestamp:
|
||||||
|
item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
|
||||||
|
|
||||||
|
results.append(item)
|
||||||
|
|
||||||
# return results
|
# return results
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -667,11 +667,6 @@ engines:
|
||||||
shortcut: fsd
|
shortcut: fsd
|
||||||
categories: [it, software wikis]
|
categories: [it, software wikis]
|
||||||
base_url: https://directory.fsf.org/
|
base_url: https://directory.fsf.org/
|
||||||
number_of_results: 5
|
|
||||||
# what part of a page matches the query string: title, text, nearmatch
|
|
||||||
# * title - query matches title
|
|
||||||
# * text - query matches the text of page
|
|
||||||
# * nearmatch - nearmatch in title
|
|
||||||
search_type: title
|
search_type: title
|
||||||
timeout: 5.0
|
timeout: 5.0
|
||||||
disabled: true
|
disabled: true
|
||||||
|
@ -1449,13 +1444,6 @@ engines:
|
||||||
engine: twitter
|
engine: twitter
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
# maybe in a fun category
|
|
||||||
# - name: uncyclopedia
|
|
||||||
# engine: mediawiki
|
|
||||||
# shortcut: unc
|
|
||||||
# base_url: https://uncyclopedia.wikia.com/
|
|
||||||
# number_of_results: 5
|
|
||||||
|
|
||||||
# tmp suspended - too slow, too many errors
|
# tmp suspended - too slow, too many errors
|
||||||
# - name: urbandictionary
|
# - name: urbandictionary
|
||||||
# engine : xpath
|
# engine : xpath
|
||||||
|
@ -1534,7 +1522,6 @@ engines:
|
||||||
shortcut: wb
|
shortcut: wb
|
||||||
categories: general
|
categories: general
|
||||||
base_url: "https://{language}.wikibooks.org/"
|
base_url: "https://{language}.wikibooks.org/"
|
||||||
number_of_results: 5
|
|
||||||
search_type: text
|
search_type: text
|
||||||
disabled: true
|
disabled: true
|
||||||
about:
|
about:
|
||||||
|
@ -1546,9 +1533,9 @@ engines:
|
||||||
shortcut: wn
|
shortcut: wn
|
||||||
categories: news
|
categories: news
|
||||||
base_url: "https://{language}.wikinews.org/"
|
base_url: "https://{language}.wikinews.org/"
|
||||||
number_of_results: 5
|
|
||||||
search_type: text
|
search_type: text
|
||||||
disabled: true
|
disabled: true
|
||||||
|
srsort: create_timestamp_desc
|
||||||
about:
|
about:
|
||||||
website: https://www.wikinews.org/
|
website: https://www.wikinews.org/
|
||||||
wikidata_id: Q964
|
wikidata_id: Q964
|
||||||
|
@ -1558,7 +1545,6 @@ engines:
|
||||||
shortcut: wq
|
shortcut: wq
|
||||||
categories: general
|
categories: general
|
||||||
base_url: "https://{language}.wikiquote.org/"
|
base_url: "https://{language}.wikiquote.org/"
|
||||||
number_of_results: 5
|
|
||||||
search_type: text
|
search_type: text
|
||||||
disabled: true
|
disabled: true
|
||||||
additional_tests:
|
additional_tests:
|
||||||
|
@ -1572,7 +1558,6 @@ engines:
|
||||||
shortcut: ws
|
shortcut: ws
|
||||||
categories: general
|
categories: general
|
||||||
base_url: "https://{language}.wikisource.org/"
|
base_url: "https://{language}.wikisource.org/"
|
||||||
number_of_results: 5
|
|
||||||
search_type: text
|
search_type: text
|
||||||
disabled: true
|
disabled: true
|
||||||
about:
|
about:
|
||||||
|
@ -1584,7 +1569,6 @@ engines:
|
||||||
shortcut: wsp
|
shortcut: wsp
|
||||||
categories: [general, science]
|
categories: [general, science]
|
||||||
base_url: "https://species.wikimedia.org/"
|
base_url: "https://species.wikimedia.org/"
|
||||||
number_of_results: 5
|
|
||||||
search_type: text
|
search_type: text
|
||||||
disabled: true
|
disabled: true
|
||||||
about:
|
about:
|
||||||
|
@ -1596,7 +1580,6 @@ engines:
|
||||||
shortcut: wt
|
shortcut: wt
|
||||||
categories: [dictionaries]
|
categories: [dictionaries]
|
||||||
base_url: "https://{language}.wiktionary.org/"
|
base_url: "https://{language}.wiktionary.org/"
|
||||||
number_of_results: 5
|
|
||||||
search_type: text
|
search_type: text
|
||||||
about:
|
about:
|
||||||
website: https://www.wiktionary.org/
|
website: https://www.wiktionary.org/
|
||||||
|
@ -1607,7 +1590,6 @@ engines:
|
||||||
shortcut: wv
|
shortcut: wv
|
||||||
categories: general
|
categories: general
|
||||||
base_url: "https://{language}.wikiversity.org/"
|
base_url: "https://{language}.wikiversity.org/"
|
||||||
number_of_results: 5
|
|
||||||
search_type: text
|
search_type: text
|
||||||
disabled: true
|
disabled: true
|
||||||
about:
|
about:
|
||||||
|
@ -1619,7 +1601,6 @@ engines:
|
||||||
shortcut: wy
|
shortcut: wy
|
||||||
categories: general
|
categories: general
|
||||||
base_url: "https://{language}.wikivoyage.org/"
|
base_url: "https://{language}.wikivoyage.org/"
|
||||||
number_of_results: 5
|
|
||||||
search_type: text
|
search_type: text
|
||||||
disabled: true
|
disabled: true
|
||||||
about:
|
about:
|
||||||
|
|
Loading…
Reference in a new issue