mirror of
https://github.com/searxng/searxng.git
synced 2025-01-23 16:48:06 +00:00
8b6a3f3e11
Improve region and language detection / all locale

Testing has shown the following behaviour for the different default and empty values of Mojeek's parameters:

| param    | idx | value  | behaviour                 |
| -------- | --- | ------ | ------------------------- |
| region   | 0   | ''     | detect region based on IP |
| region   | 1   | 'none' | all regions               |
| language | 0   | ''     | all languages             |
199 lines
5.9 KiB
Python
Executable file
199 lines
5.9 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/sxng_locales.py`

:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
  Persistence of the engines' traits, fetched from the engines.

:origin:`searx/sxng_locales.py`
  Is generated from the language and region tags that a minimum number of
  engines support.

The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`

"""
|
|
|
|
# pylint: disable=invalid-name
|
|
from unicodedata import lookup
|
|
from pathlib import Path
|
|
from pprint import pformat
|
|
import babel
|
|
|
|
from searx import settings, searx_dir
|
|
from searx import network
|
|
from searx.engines import load_engines
|
|
from searx.enginelib.traits import EngineTraitsMap
|
|
|
|
# Output file: the generated Python module with SearXNG's locale codes.
languages_file = Path(searx_dir).joinpath('sxng_locales.py')
|
|
languages_file_header = """\
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
'''List of SearXNG's locale codes.
|
|
|
|
.. hint::
|
|
|
|
Don't modify this file, this file is generated by::
|
|
|
|
./manage data.traits
|
|
'''
|
|
|
|
sxng_locales = (
|
|
"""
|
|
languages_file_footer = """,
|
|
)
|
|
'''
|
|
A list of five-digit tuples:
|
|
|
|
0. SearXNG's internal locale tag (a language or region tag)
|
|
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
|
|
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
|
|
Empty string for language tags.
|
|
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
|
|
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
|
|
are represented by a globe (\U0001F310)
|
|
|
|
.. code:: python
|
|
|
|
('en', 'English', '', 'English', '\U0001f310'),
|
|
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
|
|
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
|
|
..
|
|
('fr', 'Français', '', 'French', '\U0001f310'),
|
|
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
|
|
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
|
|
|
|
:meta hide-value:
|
|
'''
|
|
"""
|
|
|
|
|
|
# Flag overrides, looked up by get_unicode_flag() first by the locale's
# language code and then by its lower-cased territory code.
lang2emoji = {
    # language code --> flag of a country associated with the language
    'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger
    'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina
    'he': '\U0001F1EE\U0001F1F1',  # Hebrew
    # two-letter country-style codes
    'jp': '\U0001F1EF\U0001F1F5',  # Japanese
    'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian
}
|
|
|
|
|
|
def main():
    """Load the configured engines, fetch their traits and write the locale file."""
    load_engines(settings['engines'])
    # Alternative kept for debugging (presumably loads the already persisted
    # traits instead of fetching them): EngineTraitsMap.from_data()
    write_languages_file(filter_locales(fetch_traits_map()))
|
|
|
|
|
|
def fetch_traits_map():
    """Fetch the traits of the engines and persist them.

    The fetched map is saved via its ``save_data()`` method (the target file
    is reported on stdout) and then returned to the caller.
    """
    network.set_timeout_for_thread(10.0)

    def _echo(msg):
        # progress messages of the fetch go straight to stdout
        print(msg)

    fetched = EngineTraitsMap.fetch_traits(log=_echo)
    print(f"fetched properties from {len(fetched)} engines")
    print(f"write json file: {fetched.ENGINE_TRAITS_FILE}")
    fetched.save_data()
    return fetched
|
|
|
|
|
|
def filter_locales(traits_map: EngineTraitsMap):
    """Select the language & region tags that enough engines support.

    A region tag is kept when at least 18 engines list it in their
    ``regions``, a language tag when at least 20 engines list it in their
    ``languages``.  The language part of every kept region tag (the text
    before the first ``-``) is added as well.

    Returns a set of tags.
    """
    region_threshold = 18
    language_threshold = 20

    region_hits = {}
    language_hits = {}
    for engine in traits_map.values():
        for region_tag in engine.regions.keys():
            region_hits[region_tag] = region_hits.get(region_tag, 0) + 1

        for language_tag in engine.languages.keys():
            # Script variants such as zh_Hant, zh_Hans, sr_Latin or pa_Arab
            # are skipped: they are already represented by the plain
            # language tag ('zh', 'sr', 'pa').
            if '_' not in language_tag:
                language_hits[language_tag] = language_hits.get(language_tag, 0) + 1

    kept_regions = {tag for tag, hits in region_hits.items() if hits >= region_threshold}
    region_languages = {tag.split('-')[0] for tag in kept_regions}
    kept_languages = {tag for tag, hits in language_hits.items() if hits >= language_threshold}

    return kept_regions | region_languages | kept_languages
|
|
|
|
|
|
def write_languages_file(sxng_tag_list):
    """Write the locale tuples for ``sxng_tag_list`` to ``languages_file``.

    For every tag (processed in sorted order) a five-item tuple is built:
    the tag itself, the title-cased language name, the territory name (or an
    empty string), the English name up to the first ``' ('`` and the flag
    returned by :py:obj:`get_unicode_flag` wrapped in
    :py:obj:`UnicodeEscape`.  The tuples are pretty-printed between
    ``languages_file_header`` and ``languages_file_footer``.
    """
    language_codes = []

    for sxng_tag in sorted(sxng_tag_list):
        sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')

        flag = get_unicode_flag(sxng_locale) or ''
        english_name = sxng_locale.english_name

        language_codes.append(
            (
                sxng_tag,
                sxng_locale.get_language_name().title(),  # type: ignore
                sxng_locale.get_territory_name() or '',
                english_name.split(' (')[0] if english_name else '',
                UnicodeEscape(flag),
            )
        )

    # The content is rendered completely *before* the file is opened: opening
    # with mode 'w' truncates the file, so a failure while formatting must
    # not leave an empty file behind.
    #
    # [1:-1] strips the outer parentheses of the pretty-printed tuple (the
    # header / footer templates supply them).  When pformat() wraps the tuple
    # over several lines, three spaces remain in front of the first item; the
    # single blank in the template below completes the four-space indent.
    file_content = "{header} {language_codes}{footer}".format(
        header=languages_file_header,
        language_codes=pformat(tuple(language_codes), width=120, indent=4)[1:-1],
        footer=languages_file_footer,
    )

    # The context manager closes the file, no explicit close() is needed.
    with languages_file.open('w', encoding='utf-8') as new_file:
        new_file.write(file_content)
|
|
|
|
|
|
class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`

    The ``repr()`` of an instance is a single-quoted, ASCII-only Python
    string literal in which non-ASCII characters are written as escape
    sequences (e.g. ``'\\U0001f310'``) instead of the characters themselves.
    """

    def __repr__(self):
        # The output of the 'unicode-escape' codec maps byte by byte to
        # characters, which is exactly what a latin-1 decode does.
        escaped = self.encode('unicode-escape').decode('latin-1')
        # The codec escapes backslashes but not quotes; escape the single
        # quote so the result stays a valid single-quoted literal.
        return "'" + escaped.replace("'", "\\'") + "'"
|
|
|
|
|
|
def get_unicode_flag(locale: babel.Locale):
    """Determine a unicode flag (emoji) that fits to the ``locale``

    Lookup order:

    1. an entry in ``lang2emoji`` for the locale's language,
    2. the globe (\\U0001F310) if the locale has no territory,
    3. an entry in ``lang2emoji`` for the lower-cased territory,
    4. the two regional indicator symbols named by the first two characters
       of the territory.

    If a regional indicator symbol can't be looked up, an error is printed
    and ``None`` is returned.
    """
    override = lang2emoji.get(locale.language)
    if override:
        return override

    territory = locale.territory
    if not territory:
        return '\U0001F310'

    override = lang2emoji.get(territory.lower())
    if override:
        return override

    name_prefix = 'REGIONAL INDICATOR SYMBOL LETTER '
    try:
        first = lookup(name_prefix + territory[0])
        second = lookup(name_prefix + territory[1])
    except KeyError as exc:
        # unicodedata.lookup() knows no symbol of that name
        print("ERROR: %s --> %s" % (locale, exc))
        return None

    return first + second
|
|
|
|
|
|
# Run the update only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|