From dba8977b098b7f32dde78b8d7c27c5df50aacecb Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 4 Oct 2022 19:20:32 +0200 Subject: [PATCH] [mod] DuckDuckGo: fetch engine traits (data_type: supported_languages) Implements a fetch_traits function for the DuckDuckGo engines. .. note:: Does not include migration of the request methode from 'supported_languages' to 'traits' (EngineTraits) object! Signed-off-by: Markus Heiser --- searx/data/engine_traits.json | 198 +++++++++++++++++++++++- searx/engines/duckduckgo.py | 79 +++++++++- searx/engines/duckduckgo_definitions.py | 1 + searx/engines/duckduckgo_images.py | 1 + 4 files changed, 266 insertions(+), 13 deletions(-) diff --git a/searx/data/engine_traits.json b/searx/data/engine_traits.json index 27b665cbb..251b7295a 100644 --- a/searx/data/engine_traits.json +++ b/searx/data/engine_traits.json @@ -2124,11 +2124,73 @@ ] }, "ddg definitions": { - "all_locale": null, + "all_locale": "wt-wt", "custom": {}, "data_type": "supported_languages", "languages": {}, - "regions": {}, + "regions": { + "ar-SA": "xa-ar", + "bg-BG": "bg-bg", + "ca-ES": "es-ca", + "cs-CZ": "cz-cs", + "da-DK": "dk-da", + "de-AT": "at-de", + "de-CH": "ch-de", + "de-DE": "de-de", + "el-GR": "gr-el", + "en-AU": "au-en", + "en-CA": "ca-en", + "en-GB": "uk-en", + "en-IE": "ie-en", + "en-IL": "il-en", + "en-IN": "in-en", + "en-MY": "my-en", + "en-NZ": "nz-en", + "en-PH": "ph-en", + "en-PK": "pk-en", + "en-SG": "sg-en", + "en-US": "us-en", + "en-ZA": "za-en", + "es-AR": "ar-es", + "es-CL": "cl-es", + "es-CO": "co-es", + "es-ES": "es-es", + "es-MX": "mx-es", + "es-PE": "pe-es", + "es-US": "us-es", + "et-EE": "ee-et", + "fi-FI": "fi-fi", + "fr-BE": "be-fr", + "fr-CA": "ca-fr", + "fr-CH": "ch-fr", + "fr-FR": "fr-fr", + "hr-HR": "hr-hr", + "hu-HU": "hu-hu", + "id-ID": "id-en", + "it-IT": "it-it", + "ja-JP": "jp-jp", + "ko-KR": "kr-kr", + "lt-LT": "lt-lt", + "lv-LV": "lv-lv", + "nb-NO": "no-no", + "nl-BE": "be-nl", + "nl-NL": "nl-nl", + "pl-PL": "pl-pl", + "pt-BR": "br-pt", + "pt-PT": "pt-pt", + "ro-RO": "ro-ro", + "ru-RU": "ru-ru", + "sk-SK": "sk-sk", + "sl-SI": "sl-sl", + "sv-SE": "se-sv", + "th-TH": "th-en", + "tr-TR": "tr-tr", + "uk-UA": "ua-uk", + "vi-VN": "vn-en", + "zh-CN": "cn-zh", + "zh-HK": "hk-tzh", + "zh-TW": "tw-tzh" + }, "supported_languages": [ "ar-XA", "bg-BG", @@ -2196,11 +2258,73 @@ ] }, "duckduckgo": { - "all_locale": null, + "all_locale": "wt-wt", "custom": {}, "data_type": "supported_languages", "languages": {}, - "regions": {}, + "regions": { + "ar-SA": "xa-ar", + "bg-BG": "bg-bg", + "ca-ES": "es-ca", + "cs-CZ": "cz-cs", + "da-DK": "dk-da", + "de-AT": "at-de", + "de-CH": "ch-de", + "de-DE": "de-de", + "el-GR": "gr-el", + "en-AU": "au-en", + "en-CA": "ca-en", + "en-GB": "uk-en", + "en-IE": "ie-en", + "en-IL": "il-en", + "en-IN": "in-en", + "en-MY": "my-en", + "en-NZ": "nz-en", + "en-PH": "ph-en", + "en-PK": "pk-en", + "en-SG": "sg-en", + "en-US": "us-en", + "en-ZA": "za-en", + "es-AR": "ar-es", + "es-CL": "cl-es", + "es-CO": "co-es", + "es-ES": "es-es", + "es-MX": "mx-es", + "es-PE": "pe-es", + "es-US": "us-es", + "et-EE": "ee-et", + "fi-FI": "fi-fi", + "fr-BE": "be-fr", + "fr-CA": "ca-fr", + "fr-CH": "ch-fr", + "fr-FR": "fr-fr", + "hr-HR": "hr-hr", + "hu-HU": "hu-hu", + "id-ID": "id-en", + "it-IT": "it-it", + "ja-JP": "jp-jp", + "ko-KR": "kr-kr", + "lt-LT": "lt-lt", + "lv-LV": "lv-lv", + "nb-NO": "no-no", + "nl-BE": "be-nl", + "nl-NL": "nl-nl", + "pl-PL": "pl-pl", + "pt-BR": "br-pt", + "pt-PT": "pt-pt", + "ro-RO": "ro-ro", + "ru-RU": "ru-ru", + "sk-SK": "sk-sk", + "sl-SI": "sl-sl", + "sv-SE": "se-sv", + "th-TH": "th-en", + "tr-TR": "tr-tr", + "uk-UA": "ua-uk", + "vi-VN": "vn-en", + "zh-CN": "cn-zh", + "zh-HK": "hk-tzh", + "zh-TW": "tw-tzh" + }, "supported_languages": [ "ar-XA", "bg-BG", @@ -2268,11 +2392,73 @@ ] }, "duckduckgo images": { - "all_locale": null, + "all_locale": "wt-wt", "custom": {}, "data_type": "supported_languages", "languages": {}, - "regions": {}, + "regions": { + "ar-SA": "xa-ar", + "bg-BG": "bg-bg", + "ca-ES": "es-ca", + "cs-CZ": "cz-cs", + "da-DK": "dk-da", + "de-AT": "at-de", + "de-CH": "ch-de", + "de-DE": "de-de", + "el-GR": "gr-el", + "en-AU": "au-en", + "en-CA": "ca-en", + "en-GB": "uk-en", + "en-IE": "ie-en", + "en-IL": "il-en", + "en-IN": "in-en", + "en-MY": "my-en", + "en-NZ": "nz-en", + "en-PH": "ph-en", + "en-PK": "pk-en", + "en-SG": "sg-en", + "en-US": "us-en", + "en-ZA": "za-en", + "es-AR": "ar-es", + "es-CL": "cl-es", + "es-CO": "co-es", + "es-ES": "es-es", + "es-MX": "mx-es", + "es-PE": "pe-es", + "es-US": "us-es", + "et-EE": "ee-et", + "fi-FI": "fi-fi", + "fr-BE": "be-fr", + "fr-CA": "ca-fr", + "fr-CH": "ch-fr", + "fr-FR": "fr-fr", + "hr-HR": "hr-hr", + "hu-HU": "hu-hu", + "id-ID": "id-en", + "it-IT": "it-it", + "ja-JP": "jp-jp", + "ko-KR": "kr-kr", + "lt-LT": "lt-lt", + "lv-LV": "lv-lv", + "nb-NO": "no-no", + "nl-BE": "be-nl", + "nl-NL": "nl-nl", + "pl-PL": "pl-pl", + "pt-BR": "br-pt", + "pt-PT": "pt-pt", + "ro-RO": "ro-ro", + "ru-RU": "ru-ru", + "sk-SK": "sk-sk", + "sl-SI": "sl-sl", + "sv-SE": "se-sv", + "th-TH": "th-en", + "tr-TR": "tr-tr", + "uk-UA": "ua-uk", + "vi-VN": "vn-en", + "zh-CN": "cn-zh", + "zh-HK": "hk-tzh", + "zh-TW": "tw-tzh" + }, "supported_languages": [ "ar-XA", "bg-BG", diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 2a7956ca8..cb47122ae 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -3,9 +3,8 @@ """DuckDuckGo Lite """ -from json import loads - -from lxml.html import fromstring +import json +from lxml import html from searx.utils import ( dict_subset, @@ -14,7 +13,10 @@ from searx.utils import ( extract_text, match_language, ) -from searx.network import get +from searx import network +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits # about about = { @@ -120,13 +122,13 @@ def request(query, params): def response(resp): headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) - get(url_ping, headers=headers_ping) + network.get(url_ping, headers=headers_ping) if resp.status_code == 303: return [] results = [] - doc = fromstring(resp.text) + doc = html.fromstring(resp.text) result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') if not len(result_table) >= 3: @@ -180,7 +182,70 @@ def _fetch_supported_languages(resp): response_page = response_page[response_page.find('regions:{') + 8 :] response_page = response_page[: response_page.find('}') + 1] - regions_json = loads(response_page) + regions_json = json.loads(response_page) supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) return list(supported_languages) + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch regions from DuckDuckGo.""" + # pylint: disable=import-outside-toplevel + + engine_traits.data_type = 'supported_languages' # deprecated + + import babel + from searx.locales import region_tag + + engine_traits.all_locale = 'wt-wt' + + resp = network.get('https://duckduckgo.com/util/u588.js') + if not resp.ok: + print("ERROR: response from DuckDuckGo is not OK.") + + pos = resp.text.find('regions:{') + 8 + js_code = resp.text[pos:] + pos = js_code.find('}') + 1 + regions = json.loads(js_code[:pos]) + + reg_map = { + 'tw-tzh': 'zh_TW', + 'hk-tzh': 'zh_HK', + 'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES + 'es-ca': 'ca_ES', + 'id-en': 'id_ID', + 'no-no': 'nb_NO', + 'jp-jp': 'ja_JP', + 'kr-kr': 'ko_KR', + 'xa-ar': 'ar_SA', + 'sl-sl': 'sl_SI', + 'th-en': 'th_TH', + 'vn-en': 'vi_VN', + } + + for eng_tag, name in regions.items(): + + if eng_tag == 'wt-wt': + engine_traits.all_locale = 'wt-wt' + continue + + region = reg_map.get(eng_tag) + if region == 'skip': + continue + + if not region: + eng_territory, eng_lang = eng_tag.split('-') + region = eng_lang + '_' + eng_territory.upper() + + try: + sxng_tag = region_tag(babel.Locale.parse(region)) + except babel.UnknownLocaleError: + print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region)) + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 7ed0de35c..8b42799be 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -11,6 +11,7 @@ from lxml import html from searx.data import WIKIDATA_UNITS from searx.engines.duckduckgo import language_aliases from searx.engines.duckduckgo import ( # pylint: disable=unused-import + fetch_traits, _fetch_supported_languages, supported_languages_url, ) diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index 19f649ef4..927bc6cff 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -8,6 +8,7 @@ from urllib.parse import urlencode from searx.exceptions import SearxEngineAPIException from searx.engines.duckduckgo import get_region_code from searx.engines.duckduckgo import ( # pylint: disable=unused-import + fetch_traits, _fetch_supported_languages, supported_languages_url, )