[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW, this also fixes an issue in wikipedia: SearXNG's locales zh-TW and zh-HK now
use language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 2022-10-28 19:12:59 +02:00
parent e0a6ca96cc
commit 858aa3e604
5 changed files with 275 additions and 2789 deletions
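The effect on locale resolution, as a rough sketch (illustrative only; `traits`, `get_language` and the `wiki_netloc` custom field are the names introduced in the wikipedia.py diff below, and the concrete values depend on the fetched traits data):

    # Sketch, not part of the commit: how a SearXNG locale is resolved with traits_v1.
    engine_language = traits.get_language('zh-TW', 'en')             # expected: 'zh-classical'
    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language)  # expected: 'zh-classical.wikipedia.org'
    # Before this change the zh-TW and zh-HK locales ended up on zh.wikipedia.org.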


@@ -0,0 +1,27 @@
+.. _wikimedia engines:
+
+=========
+Wikimedia
+=========
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+
+.. _wikipedia engine:
+
+Wikipedia
+=========
+
+.. automodule:: searx.engines.wikipedia
+   :members:
+
+.. _wikidata engine:
+
+Wikidata
+=========
+
+.. automodule:: searx.engines.wikidata
+   :members:


@@ -143,14 +143,31 @@ def qwant(query, sxng_locale):
     return results
 
 
-def wikipedia(query, lang):
-    # wikipedia autocompleter
-    url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
-
-    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
-    if len(resp) > 1:
-        return resp[1]
-    return []
+def wikipedia(query, sxng_locale):
+    """Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
+    results = []
+    eng_traits = engines['wikipedia'].traits
+    wiki_lang = eng_traits.get_language(sxng_locale, 'en')
+    wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')
+
+    url = 'https://{wiki_netloc}/w/api.php?{args}'
+    args = urlencode(
+        {
+            'action': 'opensearch',
+            'format': 'json',
+            'formatversion': '2',
+            'search': query,
+            'namespace': '0',
+            'limit': '10',
+        }
+    )
+    resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
+    if resp.ok:
+        data = resp.json()
+        if len(data) > 1:
+            results = data[1]
+
+    return results
 
 
 def yandex(query, _lang):
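For context: the MediaWiki `opensearch` endpoint answers with a four-element JSON array, which is why the rewritten autocompleter takes `data[1]` as the suggestion list. A sketch of the expected shape (values are illustrative, not taken from this commit):

    # Illustrative opensearch response for a query like "linux" on en.wikipedia.org:
    data = [
        "linux",                                          # echoed query
        ["Linux", "Linux kernel", "Linux distribution"],  # suggested titles -> autocomplete results
        ["", "", ""],                                     # descriptions (often empty)
        ["https://en.wikipedia.org/wiki/Linux", "..."],   # article URLs
    ]
    suggestions = data[1] if len(data) > 1 else []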

File diff suppressed because it is too large.


@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Wikidata
+"""This module implements the Wikidata engine.  Some implementations are shared
+from :ref:`wikipedia engine`.
 
 """
 # pylint: disable=missing-class-docstring
 
+from typing import TYPE_CHECKING
+
 from hashlib import md5
 from urllib.parse import urlencode, unquote
 from json import loads
@@ -13,13 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_format
 
 from searx.data import WIKIDATA_UNITS
 from searx.network import post, get
-from searx.utils import match_language, searx_useragent, get_string_replaces_function
+from searx.utils import searx_useragent, get_string_replaces_function
 from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
-from searx.engines.wikipedia import (  # pylint: disable=unused-import
-    fetch_traits,
-    _fetch_supported_languages,
-    supported_languages_url,
-)
+from searx.engines.wikipedia import fetch_traits as _fetch_traits
+from searx.enginelib.traits import EngineTraits
+
+if TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
+
+traits: EngineTraits
 
 # about
 about = {
@@ -155,33 +162,35 @@ def send_wikidata_query(query, method='GET'):
 
 def request(query, params):
-    language = params['language'].split('-')[0]
-    if language == 'all':
-        language = 'en'
-    else:
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
+    # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
+    # mapped to zh
+    sxng_lang = params['searxng_locale'].split('-')[0]
+    language = traits.get_language(sxng_lang, 'en')
 
     query, attributes = get_query(query, language)
+    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
 
     params['method'] = 'POST'
     params['url'] = SPARQL_ENDPOINT_URL
     params['data'] = {'query': query}
     params['headers'] = get_headers()
-
     params['language'] = language
     params['attributes'] = attributes
+
     return params
 
 
 def response(resp):
+
     results = []
     jsonresponse = loads(resp.content.decode())
 
-    language = resp.search_params['language'].lower()
+    language = resp.search_params['language']
     attributes = resp.search_params['attributes']
+    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
 
     seen_entities = set()
 
     for result in jsonresponse.get('results', {}).get('bindings', []):
         attribute_result = {key: value['value'] for key, value in result.items()}
         entity_url = attribute_result['item']
@@ -757,3 +766,15 @@ def init(engine_settings=None):  # pylint: disable=unused-argument
         lang = result['name']['xml:lang']
         entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
         WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    """Use languages evaluated from :py:obj:`wikipedia.fetch_traits
+    <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans) what
+    is not supported by wikidata."""
+
+    _fetch_traits(engine_traits)
+    # wikidata does not support zh-classical (zh_Hans)
+    engine_traits.languages.pop('zh_Hans')
+    # wikidata does not have net-locations for the languages
+    engine_traits.custom['wiki_netloc'] = {}
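A rough sketch of what this delegation produces (assuming it runs where `searx.network` is initialized, as the traits update tooling normally provides; the concrete language set depends on the live Wikipedia data):

    from searx.enginelib.traits import EngineTraits

    engine_traits = EngineTraits()
    fetch_traits(engine_traits)   # runs wikipedia's fetch_traits, then strips wikidata specifics

    assert 'zh_Hans' not in engine_traits.languages      # popped above
    assert engine_traits.custom['wiki_netloc'] == {}     # wikidata has no per-language netloc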


@@ -1,16 +1,26 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Wikipedia (Web)
+# lint: pylint
+"""This module implements the Wikipedia engine.  Some of this implementations
+are shared by other engines:
+
+- :ref:`wikidata engine`
+
+The list of supported languages is fetched from the article linked by
+:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, wikipedia
+does not support one Wikipedia for all the languages, but there is one Wikipedia
+for every language (:py:obj:`fetch_traits`).
 """
 
-from urllib.parse import quote
-from json import loads
+import urllib.parse
+import babel
+
 from lxml import html
-from searx.utils import match_language, searx_useragent
+
 from searx import network
+from searx.locales import language_tag
 from searx.enginelib.traits import EngineTraits
 
-engine_traits: EngineTraits
+traits: EngineTraits
 
 # about
 about = {
@@ -22,32 +32,40 @@ about = {
     "results": 'JSON',
 }
 
 send_accept_language_header = True
 
-# search-url
-search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
-supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
-language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
+wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
+"""The *editing depth* of Wikipedia is one of several possible rough indicators
+of the encyclopedia's collaborative quality, showing how frequently its articles
+are updated.  The measurement of depth was introduced after some limitations of
+the classic measurement of article count were realized.
+"""
 
-
-# set language in base_url
-def url_lang(lang):
-    lang_pre = lang.split('-')[0]
-    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
-        return 'en'
-    return match_language(lang, supported_languages, language_aliases).split('-')[0]
+# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
+rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
+"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
+the first paragraph of the page in plain text and HTML as well as the type of
+page.  This is useful for page previews (fka. Hovercards, aka. Popups) on the web
+and link previews in the apps.
+
+.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
+"""
 
 
-# do search-request
 def request(query, params):
+    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
     if query.islower():
         query = query.title()
 
-    language = url_lang(params['language'])
-    params['url'] = search_url.format(title=quote(query), language=language)
+    engine_language = traits.get_language(params['searxng_locale'], 'en')
+    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
+    title = urllib.parse.quote(query)
+
+    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
+    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
+    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
 
+    params['headers']['User-Agent'] = searx_useragent()
     params['raise_for_httperror'] = False
     params['soft_max_redirects'] = 2
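Following the code comment above, a sketch of the URL that `request()` assembles for `!wikipedia 日 :zh-TW`, using the module-level `rest_v1_summary_url` (the netloc value is the one the traits lookup is expected to return):

    import urllib.parse

    wiki_netloc = 'zh-classical.wikipedia.org'   # expected traits.custom['wiki_netloc'] entry
    title = urllib.parse.quote('日')             # -> '%E6%97%A5'
    url = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
    # https://zh-classical.wikipedia.org/api/rest_v1/page/summary/%E6%97%A5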
@@ -56,13 +74,14 @@ def request(query, params):
 
 # get response from search-request
 def response(resp):
+
+    results = []
     if resp.status_code == 404:
         return []
 
     if resp.status_code == 400:
         try:
-            api_result = loads(resp.text)
-        except:
+            api_result = resp.json()
+        except Exception:  # pylint: disable=broad-except
             pass
         else:
             if (
@@ -73,52 +92,25 @@ def response(resp):
 
     network.raise_for_httperror(resp)
 
-    results = []
-    api_result = loads(resp.text)
-
-    # skip disambiguation pages
-    if api_result.get('type') != 'standard':
-        return []
-
+    api_result = resp.json()
+
     title = api_result['title']
     wikipedia_link = api_result['content_urls']['desktop']['page']
-
-    results.append({'url': wikipedia_link, 'title': title})
-
-    results.append(
-        {
-            'infobox': title,
-            'id': wikipedia_link,
-            'content': api_result.get('extract', ''),
-            'img_src': api_result.get('thumbnail', {}).get('source'),
-            'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
-        }
-    )
+    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})
+
+    if api_result.get('type') == 'standard':
+        results.append(
+            {
+                'infobox': title,
+                'id': wikipedia_link,
+                'content': api_result.get('extract', ''),
+                'img_src': api_result.get('thumbnail', {}).get('source'),
+                'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
+            }
+        )
 
     return results
 
 
-# get supported languages from their site
-def _fetch_supported_languages(resp):
-    supported_languages = {}
-    dom = html.fromstring(resp.text)
-    tables = dom.xpath('//table[contains(@class,"sortable")]')
-    for table in tables:
-        # exclude header row
-        trs = table.xpath('.//tr')[1:]
-        for tr in trs:
-            td = tr.xpath('./td')
-            code = td[3].xpath('./a')[0].text
-            name = td[1].xpath('./a')[0].text
-            english_name = td[1].xpath('./a')[0].text
-            articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
-            # exclude languages with too few articles
-            if articles >= 100:
-                supported_languages[code] = {"name": name, "english_name": english_name}
-
-    return supported_languages
-
-
 # Nonstandard language codes
 #
 # These Wikipedias use language codes that do not conform to the ISO 639
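An abbreviated rest_v1 summary payload (illustrative values), showing only the fields the rewritten `response()` reads and what they turn into:

    api_result = {
        "type": "standard",
        "title": "Linux",
        "description": "family of Unix-like operating systems",
        "extract": "Linux is a family of open-source Unix-like operating systems ...",
        "thumbnail": {"source": "https://upload.wikimedia.org/.../Tux.png"},
        "content_urls": {"desktop": {"page": "https://en.wikipedia.org/wiki/Linux"}},
    }
    # -> one link result (url / title / content from 'description') and, because
    #    type == 'standard', one extra infobox result built from 'extract' and 'thumbnail'.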
@@ -135,104 +127,57 @@ lang_map = {
     'nrm': 'nrf',
     'roa-rup': 'rup',
     'nds-nl': 'nds',
-    #'roa-tara: invented code used for the Tarantino Wikipedia (again, roa is the standard code for the large family of Romance languages that the Tarantino dialect falls within)
     #'simple: invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
-    'zh-classical': 'zh_Hant',
     'zh-min-nan': 'nan',
     'zh-yue': 'yue',
     'an': 'arg',
+    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
 }
 
 unknown_langs = [
-    'ab',  # Abkhazian
-    'alt',  # Southern Altai
     'an',  # Aragonese
-    'ang',  # Anglo-Saxon
-    'arc',  # Aramaic
-    'ary',  # Moroccan Arabic
-    'av',  # Avar
     'ba',  # Bashkir
-    'be-tarask',
     'bar',  # Bavarian
     'bcl',  # Central Bicolano
-    'bh',  # Bhojpuri
-    'bi',  # Bislama
-    'bjn',  # Banjar
-    'blk',  # Pa'O
-    'bpy',  # Bishnupriya Manipuri
-    'bxr',  # Buryat
-    'cbk-zam',  # Zamboanga Chavacano
-    'co',  # Corsican
-    'cu',  # Old Church Slavonic
-    'dty',  # Doteli
-    'dv',  # Divehi
-    'ext',  # Extremaduran
-    'fj',  # Fijian
-    'frp',  # Franco-Provençal
-    'gan',  # Gan
-    'gom',  # Goan Konkani
+    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
+    'bpy',  # Bishnupriya Manipuri is unknown by babel
     'hif',  # Fiji Hindi
     'ilo',  # Ilokano
-    'inh',  # Ingush
-    'jbo',  # Lojban
-    'kaa',  # Karakalpak
-    'kbd',  # Kabardian Circassian
-    'kg',  # Kongo
-    'koi',  # Komi-Permyak
-    'krc',  # Karachay-Balkar
-    'kv',  # Komi
-    'lad',  # Ladino
-    'lbe',  # Lak
-    'lez',  # Lezgian
     'li',  # Limburgish
-    'ltg',  # Latgalian
-    'mdf',  # Moksha
-    'mnw',  # Mon
-    'mwl',  # Mirandese
-    'myv',  # Erzya
-    'na',  # Nauruan
-    'nah',  # Nahuatl
-    'nov',  # Novial
-    'nrm',  # Norman
-    'pag',  # Pangasinan
-    'pam',  # Kapampangan
-    'pap',  # Papiamentu
-    'pdc',  # Pennsylvania German
-    'pfl',  # Palatinate German
-    'roa-rup',  # Aromanian
-    'sco',  # Scots
+    'sco',  # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic (https://gd.wikipedia.org) is known by babel
     'sh',  # Serbo-Croatian
     'simple',  # simple english is not know as a natural language different to english (babel)
-    'sm',  # Samoan
-    'srn',  # Sranan
-    'stq',  # Saterland Frisian
-    'szy',  # Sakizaya
-    'tcy',  # Tulu
-    'tet',  # Tetum
-    'tpi',  # Tok Pisin
-    'trv',  # Seediq
-    'ty',  # Tahitian
-    'tyv',  # Tuvan
-    'udm',  # Udmurt
-    'vep',  # Vepsian
-    'vls',  # West Flemish
     'vo',  # Volapük
     'wa',  # Walloon
-    'xal',  # Kalmyk
 ]
 
 
 def fetch_traits(engine_traits: EngineTraits):
-    """Fetch languages from Wikipedia"""
-    # pylint: disable=import-outside-toplevel
-    engine_traits.data_type = 'supported_languages'  # deprecated
-
-    import babel
-    from searx.locales import language_tag
-
-    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
+    """Fetch languages from Wikipedia.
+
+    The location of the Wikipedia address of a language is mapped in a
+    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
+    (``wiki_netloc``).  Here is a reduced example:
+
+    .. code:: python
+
+       traits.custom['wiki_netloc'] = {
+           "en": "en.wikipedia.org",
+            ..
+           "gsw": "als.wikipedia.org",
+            ..
+           "zh": "zh.wikipedia.org",
+           "zh-classical": "zh-classical.wikipedia.org"
+       }
+    """
+    engine_traits.custom['wiki_netloc'] = {}
+
+    # insert alias to map from a region like zh-CN to a language zh_Hans
+    engine_traits.languages['zh_Hans'] = 'zh'
+
+    resp = network.get(wikipedia_article_depth)
 
     if not resp.ok:
         print("ERROR: response from Wikipedia is not OK.")
@@ -242,34 +187,31 @@ def fetch_traits(engine_traits: EngineTraits):
         cols = row.xpath('./td')
         if not cols:
             continue
         cols = [c.text_content().strip() for c in cols]
-        articles = int(cols[4].replace(',', '').replace('-', '0'))
-        users = int(cols[8].replace(',', '').replace('-', '0'))
-        depth = cols[11].strip('-')
 
-        if articles < 1000:
+        depth = float(cols[3].replace('-', '0').replace(',', ''))
+        articles = int(cols[4].replace(',', '').replace(',', ''))
+
+        if articles < 10000:
             # exclude languages with too few articles
             continue
 
-        # depth: rough indicator of a Wikipedias quality, showing how
-        # frequently its articles are updated.
-        if depth == '':
-            if users < 1000:
-                # depth is not calculated --> at least 1000 user should registered
-                continue
-        elif int(depth) < 20:
+        if int(depth) < 20:
+            # Rough indicator of a Wikipedias quality, showing how frequently
+            # its articles are updated.
             continue
 
-        eng_tag = cols[3]
+        eng_tag = cols[2]
+        wiki_url = row.xpath('./td[3]/a/@href')[0]
+        wiki_url = urllib.parse.urlparse(wiki_url)
 
         if eng_tag in unknown_langs:
             continue
 
         try:
-            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
+            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
         except babel.UnknownLocaleError:
-            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
+            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
             continue
 
         conflict = engine_traits.languages.get(sxng_tag)
@@ -277,6 +219,6 @@ def fetch_traits(engine_traits: EngineTraits):
         if conflict != eng_tag:
             print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
             continue
-        engine_traits.languages[sxng_tag] = eng_tag
 
-    engine_traits.languages['zh_Hans'] = 'zh'
+        engine_traits.languages[sxng_tag] = eng_tag
+        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc
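A hedged usage sketch, not part of the commit: building fresh wikipedia traits offline, assuming `searx.network` is initialized the way SearXNG's traits update tooling normally arranges:

    from searx.enginelib.traits import EngineTraits
    from searx.engines import wikipedia

    engine_traits = EngineTraits()
    wikipedia.fetch_traits(engine_traits)

    # engine_traits.languages maps SearXNG tags to Wikipedia codes,
    #   e.g. {'zh_Hans': 'zh', 'zh_Hant': 'zh-classical', 'gsw': 'als', ...}
    # engine_traits.custom['wiki_netloc'] maps those codes to hosts,
    #   e.g. {'zh-classical': 'zh-classical.wikipedia.org', 'als': 'als.wikipedia.org', ...}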