From eafc2906f1ec6be52e89f5bd364093c5f1e66856 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 27 Jun 2023 16:17:17 +0200 Subject: [PATCH] [mod] engine: Anna's Archive - fetch search arguments from search form Signed-off-by: Markus Heiser --- searx/data/engine_traits.json | 132 ++++++++++++++++++++++++++++++++- searx/engines/annas_archive.py | 53 ++++++++++++- searx/settings.yml | 2 +- 3 files changed, 183 insertions(+), 4 deletions(-) diff --git a/searx/data/engine_traits.json b/searx/data/engine_traits.json index aef8bae0b..072c9a5c4 100644 --- a/searx/data/engine_traits.json +++ b/searx/data/engine_traits.json @@ -1,4 +1,134 @@ { + "annas archive": { + "all_locale": "", + "custom": { + "content": [ + "", + "journal_article", + "book_any", + "book_fiction", + "book_unknown", + "book_nonfiction", + "book_comic", + "magazine", + "standards_document" + ], + "ext": [ + "", + "pdf", + "epub", + "cbr", + "fb2", + "mobi", + "cbz", + "djvu", + "azw3", + "fb2.zip", + "txt", + "rar", + "zip", + "doc", + "lit", + "rtf", + "htm", + "html", + "lrf", + "mht", + "docx" + ], + "sort": [ + "", + "newest", + "oldest", + "largest", + "smallest" + ] + }, + "data_type": "traits_v1", + "languages": { + "af": "af", + "ar": "ar", + "az": "az", + "be": "be", + "bg": "bg", + "bn": "bn", + "bo": "bo", + "bs": "bs", + "ca": "ca", + "cs": "cs", + "da": "da", + "de": "de", + "el": "el", + "en": "en", + "eo": "eo", + "es": "es", + "et": "et", + "eu": "eu", + "fa": "fa", + "fi": "fi", + "fil": "tl", + "fr": "fr", + "gl": "gl", + "gu": "gu", + "he": "he", + "hi": "hi", + "hr": "hr", + "hu": "hu", + "hy": "hy", + "id": "id", + "is": "is", + "it": "it", + "ja": "ja", + "ka": "ka", + "kk": "kk", + "kn": "kn", + "ko": "ko", + "ku": "ku", + "ky": "ky", + "lo": "lo", + "lt": "lt", + "lv": "lv", + "mk": "mk", + "ml": "ml", + "mn": "mn", + "mr": "mr", + "ms": "ms", + "my": "my", + "nb": "nb", + "ne": "ne", + "nl": "nl", + "no": "no", + "pa": "pa", + "pl": "pl", + "ps": "ps", + "pt": "pt", + "ro": "ro", + "ru": "ru", + "sa": "sa", + "sd": "sd", + "si": "si", + "sk": "sk", + "sl": "sl", + "so": "so", + "sq": "sq", + "sr": "sr", + "sv": "sv", + "sw": "sw", + "ta": "ta", + "te": "te", + "tg": "tg", + "tr": "tr", + "tt": "tt", + "ug": "ug", + "uk": "uk", + "ur": "ur", + "uz": "uz", + "vi": "vi", + "yi": "yi", + "zh": "zh" + }, + "regions": {} + }, "arch linux wiki": { "all_locale": null, "custom": { @@ -4127,4 +4257,4 @@ }, "regions": {} } -} \ No newline at end of file +} diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py index 1d5aa41ee..c845d67c6 100644 --- a/searx/engines/annas_archive.py +++ b/searx/engines/annas_archive.py @@ -7,7 +7,8 @@ from typing import List, Dict, Any, Optional from urllib.parse import quote from lxml import html -from searx.utils import extract_text, eval_xpath +from searx.utils import extract_text, eval_xpath, eval_xpath_list +from searx.enginelib.traits import EngineTraits # about about: Dict[str, Any] = { @@ -42,7 +43,6 @@ def request(query, params: Dict[str, Any]) -> Dict[str, Any]: lang = params["language"] params["url"] = search_url.format(search_query=quote(query), lang=lang) - print(params) return params @@ -66,3 +66,52 @@ def response(resp) -> List[Dict[str, Optional[str]]]: results.append(result) return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and other search arguments from Anna's search form.""" + # pylint: disable=import-outside-toplevel + + import babel + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + from searx.locales import language_tag + + engine_traits.all_locale = '' + engine_traits.custom['content'] = [] + engine_traits.custom['ext'] = [] + engine_traits.custom['sort'] = [] + + resp = get(base_url + '/search') + if not resp.ok: # type: ignore + raise RuntimeError("Response from Anna's search page is not OK.") + dom = html.fromstring(resp.text) # type: ignore + + # supported language codes + + lang_map = {} + for x in eval_xpath_list(dom, "//form//select[@name='lang']//option"): + eng_lang = x.get("value") + if eng_lang in ('', '_empty', 'nl-BE', 'und'): + continue + try: + locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') + except babel.UnknownLocaleError: + # silently ignore unknown languages + # print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) + continue + sxng_lang = language_tag(locale) + conflict = engine_traits.languages.get(sxng_lang) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) + continue + engine_traits.languages[sxng_lang] = eng_lang + + for x in eval_xpath_list(dom, "//form//select[@name='content']//option"): + engine_traits.custom['content'].append(x.get("value")) + + for x in eval_xpath_list(dom, "//form//select[@name='ext']//option"): + engine_traits.custom['ext'].append(x.get("value")) + + for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"): + engine_traits.custom['sort'].append(x.get("value")) diff --git a/searx/settings.yml b/searx/settings.yml index 8877fba54..e42373a82 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -297,7 +297,7 @@ engines: shortcut: 9g disabled: true - - name: anna's archive + - name: annas archive engine: annas_archive paging: false categories: files