[mod] Wikipedia: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-10-08 16:22:26 +02:00
parent f78f908383
commit 7daf4f95ef
3 changed files with 391 additions and 6 deletions

View file

@ -5121,7 +5121,116 @@
"all_locale": null,
"custom": {},
"data_type": "supported_languages",
"languages": {},
"languages": {
"af": "af",
"ak": "tw",
"am": "am",
"ar": "ar",
"as": "as",
"az": "az",
"be": "be",
"bg": "bg",
"bn": "bn",
"bo": "bo",
"bs": "bs",
"ca": "ca",
"chr": "chr",
"ckb": "ckb",
"cs": "cs",
"da": "da",
"de": "de",
"dsb": "dsb",
"el": "el",
"en": "en",
"es": "es",
"et": "et",
"fa": "fa",
"fi": "fi",
"fil": "tl",
"fo": "fo",
"fr": "fr",
"fur": "fur",
"fy": "fy",
"gl": "gl",
"gsw": "als",
"gu": "gu",
"gv": "gv",
"haw": "haw",
"he": "he",
"hi": "hi",
"hsb": "hsb",
"hu": "hu",
"hy": "hy",
"id": "id",
"is": "is",
"it": "it",
"ja": "ja",
"jv": "jv",
"ka": "ka",
"km": "km",
"kn": "kn",
"ko": "ko",
"ks": "ks",
"ksh": "ksh",
"kw": "kw",
"lb": "lb",
"lg": "lg",
"ln": "ln",
"lo": "lo",
"lt": "lt",
"lv": "lv",
"mai": "mai",
"mk": "mk",
"ml": "ml",
"mn": "mn",
"mr": "mr",
"ms": "ms",
"mt": "mt",
"nds": "nds-nl",
"ne": "ne",
"no": "no",
"om": "om",
"or": "or",
"os": "os",
"pa": "pa",
"pl": "pl",
"ps": "ps",
"pt": "pt",
"qu": "qu",
"rm": "rm",
"ro": "ro",
"ru": "ru",
"rw": "rw",
"sa": "sa",
"sah": "sah",
"sd": "sd",
"se": "se",
"shi": "shi",
"si": "si",
"sk": "sk",
"sl": "sl",
"smn": "smn",
"so": "so",
"sq": "sq",
"sr": "sr",
"ta": "ta",
"te": "te",
"th": "th",
"tk": "tk",
"to": "to",
"tr": "tr",
"ug": "ug",
"uk": "uk",
"ur": "ur",
"uz": "uz",
"vi": "vi",
"wo": "wo",
"xh": "xh",
"yi": "yi",
"zh": "zh",
"zh_Hans": "zh",
"zh_Hant": "zh-classical"
},
"regions": {},
"supported_languages": {
"ab": {
@ -6402,7 +6511,116 @@
"all_locale": null,
"custom": {},
"data_type": "supported_languages",
"languages": {},
"languages": {
"af": "af",
"ak": "tw",
"am": "am",
"ar": "ar",
"as": "as",
"az": "az",
"be": "be",
"bg": "bg",
"bn": "bn",
"bo": "bo",
"bs": "bs",
"ca": "ca",
"chr": "chr",
"ckb": "ckb",
"cs": "cs",
"da": "da",
"de": "de",
"dsb": "dsb",
"el": "el",
"en": "en",
"es": "es",
"et": "et",
"fa": "fa",
"fi": "fi",
"fil": "tl",
"fo": "fo",
"fr": "fr",
"fur": "fur",
"fy": "fy",
"gl": "gl",
"gsw": "als",
"gu": "gu",
"gv": "gv",
"haw": "haw",
"he": "he",
"hi": "hi",
"hsb": "hsb",
"hu": "hu",
"hy": "hy",
"id": "id",
"is": "is",
"it": "it",
"ja": "ja",
"jv": "jv",
"ka": "ka",
"km": "km",
"kn": "kn",
"ko": "ko",
"ks": "ks",
"ksh": "ksh",
"kw": "kw",
"lb": "lb",
"lg": "lg",
"ln": "ln",
"lo": "lo",
"lt": "lt",
"lv": "lv",
"mai": "mai",
"mk": "mk",
"ml": "ml",
"mn": "mn",
"mr": "mr",
"ms": "ms",
"mt": "mt",
"nds": "nds-nl",
"ne": "ne",
"no": "no",
"om": "om",
"or": "or",
"os": "os",
"pa": "pa",
"pl": "pl",
"ps": "ps",
"pt": "pt",
"qu": "qu",
"rm": "rm",
"ro": "ro",
"ru": "ru",
"rw": "rw",
"sa": "sa",
"sah": "sah",
"sd": "sd",
"se": "se",
"shi": "shi",
"si": "si",
"sk": "sk",
"sl": "sl",
"smn": "smn",
"so": "so",
"sq": "sq",
"sr": "sr",
"ta": "ta",
"te": "te",
"th": "th",
"tk": "tk",
"to": "to",
"tr": "tr",
"ug": "ug",
"uk": "uk",
"ur": "ur",
"uz": "uz",
"vi": "vi",
"wo": "wo",
"xh": "xh",
"yi": "yi",
"zh": "zh",
"zh_Hans": "zh",
"zh_Hant": "zh-classical"
},
"regions": {},
"supported_languages": {
"ab": {

View file

@ -16,6 +16,7 @@ from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import ( # pylint: disable=unused-import
fetch_traits,
_fetch_supported_languages,
supported_languages_url,
)

View file

@ -5,9 +5,12 @@
from urllib.parse import quote
from json import loads
from lxml.html import fromstring
from lxml import html
from searx.utils import match_language, searx_useragent
from searx.network import raise_for_httperror
from searx import network
from searx.enginelib.traits import EngineTraits
engine_traits: EngineTraits
# about
about = {
@ -68,7 +71,7 @@ def response(resp):
):
return []
raise_for_httperror(resp)
network.raise_for_httperror(resp)
results = []
api_result = loads(resp.text)
@ -98,7 +101,7 @@ def response(resp):
# get supported languages from their site
def _fetch_supported_languages(resp):
supported_languages = {}
dom = fromstring(resp.text)
dom = html.fromstring(resp.text)
tables = dom.xpath('//table[contains(@class,"sortable")]')
for table in tables:
# exclude header row
@ -114,3 +117,166 @@ def _fetch_supported_languages(resp):
supported_languages[code] = {"name": name, "english_name": english_name}
return supported_languages
# Mapping of nonstandard Wikipedia language codes.
#
# Some Wikipedias live under subdomains whose code does not conform to the
# ISO 639 standard (which is how wiki subdomains are chosen nowadays).  The
# keys are those Wikipedia codes, the values are the codes handed to babel
# instead.
#
# NOTE: fetch_traits() skips every code listed in ``unknown_langs`` before it
# consults this map, so an entry whose key is also listed there is currently
# never used.
lang_map = {
    'be-tarask': 'bel',
    'ak': 'aka',
    'als': 'gsw',
    'bat-smg': 'sgs',
    'cbk-zam': 'cbk',
    'fiu-vro': 'vro',
    'map-bms': 'map',
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    # Intentionally not mapped:
    #
    # - 'roa-tara': invented code used for the Tarantino Wikipedia ("roa" is
    #   the standard code for the large family of Romance languages that the
    #   Tarantino dialect falls within).
    # - 'simple': invented code used for the Simple English Wikipedia (not
    #   the official IETF code en-simple).
    'zh-classical': 'zh_Hant',
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
}
unknown_langs = [
'ab', # Abkhazian
'alt', # Southern Altai
'an', # Aragonese
'ang', # Anglo-Saxon
'arc', # Aramaic
'ary', # Moroccan Arabic
'av', # Avar
'ba', # Bashkir
'be-tarask',
'bar', # Bavarian
'bcl', # Central Bicolano
'bh', # Bhojpuri
'bi', # Bislama
'bjn', # Banjar
'blk', # Pa'O
'bpy', # Bishnupriya Manipuri
'bxr', # Buryat
'cbk-zam', # Zamboanga Chavacano
'co', # Corsican
'cu', # Old Church Slavonic
'dty', # Doteli
'dv', # Divehi
'ext', # Extremaduran
'fj', # Fijian
'frp', # Franco-Provençal
'gan', # Gan
'gom', # Goan Konkani
'hif', # Fiji Hindi
'ilo', # Ilokano
'inh', # Ingush
'jbo', # Lojban
'kaa', # Karakalpak
'kbd', # Kabardian Circassian
'kg', # Kongo
'koi', # Komi-Permyak
'krc', # Karachay-Balkar
'kv', # Komi
'lad', # Ladino
'lbe', # Lak
'lez', # Lezgian
'li', # Limburgish
'ltg', # Latgalian
'mdf', # Moksha
'mnw', # Mon
'mwl', # Mirandese
'myv', # Erzya
'na', # Nauruan
'nah', # Nahuatl
'nov', # Novial
'nrm', # Norman
'pag', # Pangasinan
'pam', # Kapampangan
'pap', # Papiamentu
'pdc', # Pennsylvania German
'pfl', # Palatinate German
'roa-rup', # Aromanian
'sco', # Scots
'sco', # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic (https://gd.wikipedia.org) is known by babel
'sh', # Serbo-Croatian
'simple', # simple english is not know as a natural language different to english (babel)
'sm', # Samoan
'srn', # Sranan
'stq', # Saterland Frisian
'szy', # Sakizaya
'tcy', # Tulu
'tet', # Tetum
'tpi', # Tok Pisin
'trv', # Seediq
'ty', # Tahitian
'tyv', # Tuvan
'udm', # Udmurt
'vep', # Vepsian
'vls', # West Flemish
'vo', # Volapük
'wa', # Walloon
'xal', # Kalmyk
]
def fetch_traits(engine_traits: EngineTraits):
    """Fetch the languages of Wikipedia and store them in ``engine_traits``.

    Requests the *List of Wikipedias* page from meta.wikimedia.org and walks
    the rows of every table whose class contains ``sortable``.  For each row
    that passes the filters below, ``engine_traits.languages`` gets an entry
    that maps the SearXNG language tag (built with babel) to the Wikipedia
    language code.

    A row is skipped when:

    - it has fewer than 1000 articles,
    - its depth is below 20 or, when no depth is given, it has fewer than
      1000 users,
    - its Wikipedia code is listed in ``unknown_langs``,
    - babel does not know the (``lang_map``-translated) code,
    - the resulting SearXNG tag has already been assigned (a ``CONFLICT``
      message is printed when the codes differ).

    If the HTTP response is not OK, an error is printed and no language is
    added.

    :param engine_traits: traits object, updated in place.
    """
    # pylint: disable=import-outside-toplevel
    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.locales import language_tag

    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")
        # Do not parse an error page (or an empty body) as if it were the
        # list of Wikipedias.
        return

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            # rows without <td> cells (e.g. header rows) carry no data
            continue

        # NOTE(review): the column positions used below (1, 3, 4, 8, 11)
        # depend on the layout of the remote table -- verify when the page
        # changes.
        cols = [c.text_content().strip() for c in cols]

        # a '-' in a numeric column is counted as 0
        articles = int(cols[4].replace(',', '').replace('-', '0'))
        users = int(cols[8].replace(',', '').replace('-', '0'))
        depth = cols[11].strip('-')

        if articles < 1000:
            # exclude languages with too few articles
            continue

        # depth: rough indicator of a Wikipedia's quality, showing how
        # frequently its articles are updated.
        if depth == '':
            if users < 1000:
                # depth is not calculated --> at least 1000 users should be
                # registered
                continue
        elif int(depth) < 20:
            continue

        eng_tag = cols[3]
        if eng_tag in unknown_langs:
            continue

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
            continue

        # the first Wikipedia assigned to a SearXNG tag wins
        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag

    # always map the 'zh_Hans' tag to the 'zh' Wikipedia
    engine_traits.languages['zh_Hans'] = 'zh'