[enh] engine: mojeek - add language support

Improve region and language detection and handling of the 'all' locale.

Testing has shown the following behaviour for the different
default and empty values of Mojeek's parameters (a short
illustrative sketch follows the table):

| param    | idx | value  | behaviour                 |
| -------- | --- | ------ | ------------------------- |
| region   |  0  | ''     | detect region based on IP |
| region   |  1  | 'none' | all regions               |
| language |  0  | ''     | all languages             |
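
For illustration only (not part of this commit): a minimal sketch of how a search request could be built with these parameters. The parameter names `lb` (language) and `arc` (region) come from the diff below; the base URL and the example values are assumptions.

```python
from urllib.parse import urlencode

# Assumed base URL for the sketch; the engine's real base_url is defined elsewhere.
base_url = 'https://www.mojeek.com'


def build_search_url(query: str, language: str = '', region: str = '') -> str:
    args = {
        'q': query,
        'lb': language,  # '' -> all languages
        'arc': region,   # '' -> detect region from IP, 'none' -> all regions
    }
    return f"{base_url}/search?{urlencode(args)}"


print(build_search_url('searxng'))                               # IP-based region, all languages
print(build_search_url('searxng', language='de', region='de'))   # German language, DE region
print(build_search_url('searxng', region='none'))                # all regions
```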
0xhtml 2024-09-29 17:27:29 +02:00 committed by Markus Heiser
parent 5b6f40414a
commit 8b6a3f3e11
3 changed files with 49 additions and 5 deletions


@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Mojeek (general, images, news)"""
+from typing import TYPE_CHECKING
 from datetime import datetime
 from urllib.parse import urlencode
 from lxml import html
 from dateutil.relativedelta import relativedelta
 from searx.utils import eval_xpath, eval_xpath_list, extract_text
+from searx.enginelib.traits import EngineTraits
 about = {
     'website': 'https://mojeek.com',
@@ -42,6 +45,18 @@ news_url_xpath = './/h2/a/@href'
 news_title_xpath = './/h2/a'
 news_content_xpath = './/p[@class="s"]'
+language_param = 'lb'
+region_param = 'arc'
+_delta_kwargs = {'day': 'days', 'week': 'weeks', 'month': 'months', 'year': 'years'}
+if TYPE_CHECKING:
+    import logging
+    logger = logging.getLogger()
+traits: EngineTraits
 def init(_):
     if search_type not in ('', 'images', 'news'):
@@ -53,13 +68,16 @@ def request(query, params):
         'q': query,
         'safe': min(params['safesearch'], 1),
         'fmt': search_type,
+        language_param: traits.get_language(params['searxng_locale'], traits.custom['language_all']),
+        region_param: traits.get_region(params['searxng_locale'], traits.custom['region_all']),
     }
     if search_type == '':
         args['s'] = 10 * (params['pageno'] - 1)
     if params['time_range'] and search_type != 'images':
-        args["since"] = (datetime.now() - relativedelta(**{f"{params['time_range']}s": 1})).strftime("%Y%m%d")
+        kwargs = {_delta_kwargs[params['time_range']]: 1}
+        args["since"] = (datetime.now() - relativedelta(**kwargs)).strftime("%Y%m%d")  # type: ignore
         logger.debug(args["since"])
     params['url'] = f"{base_url}/search?{urlencode(args)}"
@@ -94,7 +112,7 @@ def _image_results(dom):
                 'template': 'images.html',
                 'url': extract_text(eval_xpath(result, image_url_xpath)),
                 'title': extract_text(eval_xpath(result, image_title_xpath)),
-                'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)),
+                'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)),  # type: ignore
                 'content': '',
             }
         )
@@ -130,3 +148,31 @@ def response(resp):
         return _news_results(dom)
     raise ValueError(f"Invalid search type {search_type}")
+def fetch_traits(engine_traits: EngineTraits):
+    # pylint: disable=import-outside-toplevel
+    from searx import network
+    from searx.locales import get_official_locales, region_tag
+    from babel import Locale, UnknownLocaleError
+    import contextlib
+    resp = network.get(base_url + "/preferences", headers={'Accept-Language': 'en-US,en;q=0.5'})
+    dom = html.fromstring(resp.text)  # type: ignore
+    languages = eval_xpath_list(dom, f'//select[@name="{language_param}"]/option/@value')
+    engine_traits.custom['language_all'] = languages[0]
+    for code in languages[1:]:
+        with contextlib.suppress(UnknownLocaleError):
+            locale = Locale(code)
+            engine_traits.languages[locale.language] = code
+    regions = eval_xpath_list(dom, f'//select[@name="{region_param}"]/option/@value')
+    engine_traits.custom['region_all'] = regions[1]
+    for code in regions[2:]:
+        for locale in get_official_locales(code, engine_traits.languages):
+            engine_traits.regions[region_tag(locale)] = code


@@ -101,7 +101,7 @@ def fetch_traits_map():
 def filter_locales(traits_map: EngineTraitsMap):
     """Filter language & region tags by a threshold."""
-    min_eng_per_region = 15
+    min_eng_per_region = 18
     min_eng_per_lang = 20
     _ = {}


@@ -32,12 +32,10 @@ class TestLocales(SearxTestCase):
     @parameterized.expand(
         [
-            ('ca-es', 'ca-ES'),
             ('de-at', 'de-AT'),
             ('de-de', 'de-DE'),
             ('en-UK', 'en-GB'),
             ('fr-be', 'fr-BE'),
-            ('fr-be', 'fr-BE'),
             ('fr-ca', 'fr-CA'),
             ('fr-ch', 'fr-CH'),
             ('zh-cn', 'zh-CN'),