#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/sxng_locales.py`

:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
  Persistence of engines traits, fetched from the engines.

:origin:`searx/sxng_locales.py`
  Is generated from the language and region tags that are supported by a
  minimum number of engines (see :py:obj:`filter_locales`).

The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`

"""

# pylint: disable=invalid-name
from collections import Counter
from unicodedata import lookup
from pathlib import Path
from pprint import pformat
import babel

from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines
from searx.enginelib.traits import EngineTraitsMap

# Output files.
languages_file = Path(searx_dir) / 'sxng_locales.py'

# Text written in front of the generated tuple.  It must end with the opening
# parenthesis of ``sxng_locales`` because the tuple body is appended to it.
languages_file_header = """\
# SPDX-License-Identifier: AGPL-3.0-or-later
'''List of SearXNG's locale codes.

.. hint::

   Don't modify this file, this file is generated by::

     ./manage data.traits
'''

sxng_locales = (
"""

# Text written behind the generated tuple body: closes the tuple and adds the
# attribute docstring of ``sxng_locales``.  This is a non-raw string on purpose,
# the ``\U...`` escapes are emitted as real characters into the generated file.
languages_file_footer = """,
)
'''
A list of five-digit tuples:

0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
   Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
   are represented by a globe (\U0001F310)

.. code:: python

   ('en',    'English', '',              'English', '\U0001f310'),
   ('en-CA', 'English', 'Canada',        'English', '\U0001f1e8\U0001f1e6'),
   ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
   ..
   ('fr',    'Français', '',             'French',  '\U0001f310'),
   ('fr-BE', 'Français', 'Belgique',     'French',  '\U0001f1e7\U0001f1ea'),
   ('fr-CA', 'Français', 'Canada',       'French',  '\U0001f1e8\U0001f1e6'),

:meta hide-value:
'''
"""

# Flag overrides, consulted by get_unicode_flag() before a flag is built from
# the territory letters.  Keys are matched against the locale's language code
# and, if that fails, against the lower-cased territory code.
lang2emoji = {
    'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger
    'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina
    'jp': '\U0001F1EF\U0001F1F5',  # Japanese
    'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian
    'he': '\U0001F1EE\U0001F1F1',  # Hebrew
}


def main():
    """Load the configured engines, fetch their traits and regenerate the
    locales file from the tags that pass :py:obj:`filter_locales`."""
    load_engines(settings['engines'])
    # traits_map = EngineTraitsMap.from_data()
    traits_map = fetch_traits_map()
    sxng_tag_list = filter_locales(traits_map)
    write_languages_file(sxng_tag_list)


def fetch_traits_map():
    """Fetches supported languages for each engine and writes json file with those.

    :return: the fetched :py:obj:`EngineTraitsMap` (already saved by
      ``save_data()``).
    """
    network.set_timeout_for_thread(10.0)

    # progress messages of the fetch are simply printed to stdout
    traits_map = EngineTraitsMap.fetch_traits(log=print)
    print(f"fetched properties from {len(traits_map)} engines")
    print(f"write json file: {traits_map.ENGINE_TRAITS_FILE}")
    traits_map.save_data()
    return traits_map


def filter_locales(traits_map: EngineTraitsMap):
    """Filter language & region tags by a threshold.

    A region tag is kept when at least ``min_eng_per_region`` engines list it,
    a language tag when at least ``min_eng_per_lang`` engines list it.  The
    language part of every kept region tag is kept as well, even if the
    language alone does not reach its threshold.

    :param traits_map: traits of the engines; each value is read through its
      ``regions`` and ``languages`` mappings.
    :return: set of the selected SearXNG locale tags.
    """
    min_eng_per_region = 15
    min_eng_per_lang = 20

    # number of engines per region tag
    region_count = Counter(reg for eng in traits_map.values() for reg in eng.regions.keys())
    regions = {tag for tag, count in region_count.items() if count >= min_eng_per_region}
    lang_from_region = {tag.split('-')[0] for tag in regions}

    # Number of engines per language tag.  Script types like zh_Hant, zh_Hans
    # or sr_Latin, pa_Arab are ignored (they are already counted by existence
    # of 'zh' or 'sr', 'pa').
    lang_count = Counter(
        lang for eng in traits_map.values() for lang in eng.languages.keys() if '_' not in lang
    )
    languages = {tag for tag, count in lang_count.items() if count >= min_eng_per_lang}

    return regions | lang_from_region | languages


def write_languages_file(sxng_tag_list):
    """Write the locales file (``languages_file``) for the given tags.

    For each tag (processed in sorted order) one tuple is generated: tag,
    language name, territory name, english name and flag; see the text in
    ``languages_file_footer`` for the meaning of the fields.

    :param sxng_tag_list: iterable of SearXNG locale tags (``-`` separated).
    """
    language_codes = []
    for sxng_tag in sorted(sxng_tag_list):
        sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')

        # get_unicode_flag() returns None if no flag can be determined
        flag = get_unicode_flag(sxng_locale) or ''

        item = (
            sxng_tag,
            sxng_locale.get_language_name().title(),  # type: ignore
            sxng_locale.get_territory_name() or '',
            # strip the "(territory)" suffix from the english name
            sxng_locale.english_name.split(' (')[0] if sxng_locale.english_name else '',
            UnicodeEscape(flag),
        )
        language_codes.append(item)

    language_codes = tuple(language_codes)

    # Render the complete content before touching the file, an error while
    # formatting must not leave a truncated file behind.  The outer parentheses
    # of the pformat output are stripped ([1:-1]), header and footer supply
    # the opening and the closing one.
    file_content = "{header} {language_codes}{footer}".format(
        header=languages_file_header,
        language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
        footer=languages_file_footer,
    )
    languages_file.write_text(file_content, encoding='utf-8')


class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`"""

    def __repr__(self):
        # 'unicode-escape' produces pure ASCII bytes, e.g. b'\\U0001f310'
        return "'" + self.encode('unicode-escape').decode('ascii') + "'"


def get_unicode_flag(locale: babel.Locale):
    """Determine a unicode flag (emoji) that fits to the ``locale``

    The lookup order is:

    1. an override from ``lang2emoji`` for the language of the locale,
    2. a globe for locales without a territory,
    3. an override from ``lang2emoji`` for the (lower-cased) territory,
    4. the two *regional indicator symbols* built from the first two characters
       of the territory.

    :return: the flag or ``None`` if the territory can't be mapped to regional
      indicator symbols (an error message is printed in this case).
    """
    emoji = lang2emoji.get(locale.language)
    if emoji:
        return emoji

    if not locale.territory:
        return '\U0001F310'

    emoji = lang2emoji.get(locale.territory.lower())
    if emoji:
        return emoji

    try:
        # lookup() raises a KeyError if there is no such unicode character
        # name, e.g. for a territory that is not made of latin letters
        c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0])
        c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1])
    except KeyError as exc:
        print(f"ERROR: {locale} --> {exc}")
        return None

    return c1 + c2


if __name__ == "__main__":
    main()