mirror of
https://github.com/searxng/searxng.git
synced 2025-01-23 16:48:06 +00:00
Merge pull request #2269 from return42/locale-revision
Revision of the locale- and language- handling in SearXNG
This commit is contained in:
commit
f950119ca8
75 changed files with 7823 additions and 6414 deletions
2
.github/workflows/data-update.yml
vendored
2
.github/workflows/data-update.yml
vendored
|
@ -17,7 +17,7 @@ jobs:
|
|||
- update_currencies.py
|
||||
- update_external_bangs.py
|
||||
- update_firefox_version.py
|
||||
- update_languages.py
|
||||
- update_engine_traits.py
|
||||
- update_wikidata_units.py
|
||||
- update_engine_descriptions.py
|
||||
steps:
|
||||
|
|
|
@ -42,7 +42,7 @@ Explanation of the :ref:`general engine configuration` shown in the table
|
|||
- Timeout
|
||||
- Weight
|
||||
- Paging
|
||||
- Language
|
||||
- Language, Region
|
||||
- Safe search
|
||||
- Time range
|
||||
|
||||
|
|
|
@ -569,10 +569,13 @@ engine is shown. Most of the options have a default value or even are optional.
|
|||
To disable by default the engine, but not deleting it. It will allow the user
|
||||
to manually activate it in the settings.
|
||||
|
||||
``inactive``: optional
|
||||
Remove the engine from the settings (*disabled & removed*).
|
||||
|
||||
``language`` : optional
|
||||
If you want to use another language for a specific engine, you can define it
|
||||
by using the full ISO code of language and country, like ``fr_FR``, ``en_US``,
|
||||
``de_DE``.
|
||||
by using the ISO code of language (and region), like ``fr``, ``en-US``,
|
||||
``de-DE``.
|
||||
|
||||
``tokens`` : optional
|
||||
A list of secret tokens to make this engine *private*, more details see
|
||||
|
|
|
@ -127,6 +127,10 @@ extensions = [
|
|||
'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page
|
||||
]
|
||||
|
||||
autodoc_default_options = {
|
||||
'member-order': 'groupwise',
|
||||
}
|
||||
|
||||
myst_enable_extensions = [
|
||||
"replacements", "smartquotes"
|
||||
]
|
||||
|
@ -135,6 +139,7 @@ suppress_warnings = ['myst.domains']
|
|||
|
||||
intersphinx_mapping = {
|
||||
"python": ("https://docs.python.org/3/", None),
|
||||
"babel" : ("https://babel.readthedocs.io/en/latest/", None),
|
||||
"flask": ("https://flask.palletsprojects.com/", None),
|
||||
"flask_babel": ("https://python-babel.github.io/flask-babel/", None),
|
||||
# "werkzeug": ("https://werkzeug.palletsprojects.com/", None),
|
||||
|
|
|
@ -54,6 +54,7 @@ Engine File
|
|||
- ``offline`` :ref:`[ref] <offline engines>`
|
||||
- ``online_dictionary``
|
||||
- ``online_currency``
|
||||
- ``online_url_search``
|
||||
======================= =========== ========================================================
|
||||
|
||||
.. _engine settings:
|
||||
|
@ -131,8 +132,10 @@ Passed Arguments (request)
|
|||
These arguments can be used to construct the search query. Furthermore,
|
||||
parameters with default value can be redefined for special purposes.
|
||||
|
||||
.. _engine request online:
|
||||
|
||||
.. table:: If the ``engine_type`` is ``online``
|
||||
.. table:: If the ``engine_type`` is :py:obj:`online
|
||||
<searx.search.processors.online.OnlineProcessor.get_params>`
|
||||
:width: 100%
|
||||
|
||||
====================== ============== ========================================================================
|
||||
|
@ -149,12 +152,16 @@ parameters with default value can be redefined for special purposes.
|
|||
safesearch int ``0``, between ``0`` and ``2`` (normal, moderate, strict)
|
||||
time_range Optional[str] ``None``, can be ``day``, ``week``, ``month``, ``year``
|
||||
pageno int current pagenumber
|
||||
language str specific language code like ``'en_US'``, or ``'all'`` if unspecified
|
||||
searxng_locale str SearXNG's locale selected by user. Specific language code like
|
||||
``'en'``, ``'en-US'``, or ``'all'`` if unspecified.
|
||||
====================== ============== ========================================================================
|
||||
|
||||
|
||||
.. table:: If the ``engine_type`` is ``online_dictionary``, in addition to the
|
||||
``online`` arguments:
|
||||
.. _engine request online_dictionary:
|
||||
|
||||
.. table:: If the ``engine_type`` is :py:obj:`online_dictionary
|
||||
<searx.search.processors.online_dictionary.OnlineDictionaryProcessor.get_params>`,
|
||||
in addition to the :ref:`online <engine request online>` arguments:
|
||||
:width: 100%
|
||||
|
||||
====================== ============== ========================================================================
|
||||
|
@ -165,8 +172,11 @@ parameters with default value can be redefined for special purposes.
|
|||
query str the text query without the languages
|
||||
====================== ============== ========================================================================
|
||||
|
||||
.. table:: If the ``engine_type`` is ``online_currency```, in addition to the
|
||||
``online`` arguments:
|
||||
.. _engine request online_currency:
|
||||
|
||||
.. table:: If the ``engine_type`` is :py:obj:`online_currency
|
||||
<searx.search.processors.online_currency.OnlineCurrencyProcessor.get_params>`,
|
||||
in addition to the :ref:`online <engine request online>` arguments:
|
||||
:width: 100%
|
||||
|
||||
====================== ============== ========================================================================
|
||||
|
@ -179,6 +189,26 @@ parameters with default value can be redefined for special purposes.
|
|||
to_name str currency name
|
||||
====================== ============== ========================================================================
|
||||
|
||||
.. _engine request online_url_search:
|
||||
|
||||
.. table:: If the ``engine_type`` is :py:obj:`online_url_search
|
||||
<searx.search.processors.online_url_search.OnlineUrlSearchProcessor.get_params>`,
|
||||
in addition to the :ref:`online <engine request online>` arguments:
|
||||
:width: 100%
|
||||
|
||||
====================== ============== ========================================================================
|
||||
argument type default-value, information
|
||||
====================== ============== ========================================================================
|
||||
search_url dict URLs from the search query:
|
||||
|
||||
.. code:: python
|
||||
|
||||
{
|
||||
'http': str,
|
||||
'ftp': str,
|
||||
'data:image': str
|
||||
}
|
||||
====================== ============== ========================================================================
|
||||
|
||||
Specify Request
|
||||
---------------
|
||||
|
|
|
@ -52,12 +52,12 @@ Scripts to update static data in :origin:`searx/data/`
|
|||
:members:
|
||||
|
||||
|
||||
``update_languages.py``
|
||||
=======================
|
||||
``update_engine_traits.py``
|
||||
===========================
|
||||
|
||||
:origin:`[source] <searxng_extra/update/update_languages.py>`
|
||||
:origin:`[source] <searxng_extra/update/update_engine_traits.py>`
|
||||
|
||||
.. automodule:: searxng_extra.update.update_languages
|
||||
.. automodule:: searxng_extra.update.update_engine_traits
|
||||
:members:
|
||||
|
||||
|
||||
|
|
9
docs/src/searx.engine.archlinux.rst
Normal file
9
docs/src/searx.engine.archlinux.rst
Normal file
|
@ -0,0 +1,9 @@
|
|||
.. _archlinux engine:
|
||||
|
||||
==========
|
||||
Arch Linux
|
||||
==========
|
||||
|
||||
.. automodule:: searx.engines.archlinux
|
||||
:members:
|
||||
|
8
docs/src/searx.engine.dailymotion.rst
Normal file
8
docs/src/searx.engine.dailymotion.rst
Normal file
|
@ -0,0 +1,8 @@
|
|||
.. _dailymotion engine:
|
||||
|
||||
===========
|
||||
Dailymotion
|
||||
===========
|
||||
|
||||
.. automodule:: searx.engines.dailymotion
|
||||
:members:
|
22
docs/src/searx.engine.duckduckgo.rst
Normal file
22
docs/src/searx.engine.duckduckgo.rst
Normal file
|
@ -0,0 +1,22 @@
|
|||
.. _duckduckgo engines:
|
||||
|
||||
==================
|
||||
DuckDuckGo engines
|
||||
==================
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
.. automodule:: searx.engines.duckduckgo
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.engines.duckduckgo_images
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.engines.duckduckgo_definitions
|
||||
:members:
|
||||
|
||||
.. automodule:: searx.engines.duckduckgo_weather
|
||||
:members:
|
17
docs/src/searx.enginelib.rst
Normal file
17
docs/src/searx.enginelib.rst
Normal file
|
@ -0,0 +1,17 @@
|
|||
.. _searx.enginelib:
|
||||
|
||||
============
|
||||
Engine model
|
||||
============
|
||||
|
||||
.. automodule:: searx.enginelib
|
||||
:members:
|
||||
|
||||
.. _searx.enginelib.traits:
|
||||
|
||||
=============
|
||||
Engine traits
|
||||
=============
|
||||
|
||||
.. automodule:: searx.enginelib.traits
|
||||
:members:
|
43
docs/src/searx.engines.bing.rst
Normal file
43
docs/src/searx.engines.bing.rst
Normal file
|
@ -0,0 +1,43 @@
|
|||
.. _bing engines:
|
||||
|
||||
============
|
||||
Bing Engines
|
||||
============
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
|
||||
.. _bing web engine:
|
||||
|
||||
Bing WEB
|
||||
========
|
||||
|
||||
.. automodule:: searx.engines.bing
|
||||
:members:
|
||||
|
||||
.. _bing images engine:
|
||||
|
||||
Bing Images
|
||||
===========
|
||||
|
||||
.. automodule:: searx.engines.bing_images
|
||||
:members:
|
||||
|
||||
.. _bing videos engine:
|
||||
|
||||
Bing Videos
|
||||
===========
|
||||
|
||||
.. automodule:: searx.engines.bing_videos
|
||||
:members:
|
||||
|
||||
.. _bing news engine:
|
||||
|
||||
Bing News
|
||||
=========
|
||||
|
||||
.. automodule:: searx.engines.bing_news
|
||||
:members:
|
|
@ -12,15 +12,21 @@ Google Engines
|
|||
|
||||
.. _google API:
|
||||
|
||||
google API
|
||||
Google API
|
||||
==========
|
||||
|
||||
.. _Query Parameter Definitions:
|
||||
https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
|
||||
|
||||
SearXNG's implementation of the Google API is mainly done in
|
||||
:py:obj:`get_google_info <searx.engines.google.get_google_info>`.
|
||||
|
||||
For detailed description of the *REST-full* API see: `Query Parameter
|
||||
Definitions`_. Not all parameters can be appied and some engines are *special*
|
||||
(e.g. :ref:`google news engine`).
|
||||
Definitions`_. The linked API documentation can sometimes be helpful during
|
||||
reverse engineering. However, we cannot use it in the freely accessible WEB
|
||||
services; not all parameters can be applied and some engines are more *special*
|
||||
than others (e.g. :ref:`google news engine`).
|
||||
|
||||
|
||||
.. _google web engine:
|
||||
|
||||
|
@ -30,6 +36,13 @@ Google WEB
|
|||
.. automodule:: searx.engines.google
|
||||
:members:
|
||||
|
||||
.. _google autocomplete:
|
||||
|
||||
Google Autocomplete
|
||||
====================
|
||||
|
||||
.. autofunction:: searx.autocomplete.google_complete
|
||||
|
||||
.. _google images engine:
|
||||
|
||||
Google Images
|
||||
|
@ -53,3 +66,11 @@ Google News
|
|||
|
||||
.. automodule:: searx.engines.google_news
|
||||
:members:
|
||||
|
||||
.. _google scholar engine:
|
||||
|
||||
Google Scholar
|
||||
==============
|
||||
|
||||
.. automodule:: searx.engines.google_scholar
|
||||
:members:
|
||||
|
|
27
docs/src/searx.engines.peertube.rst
Normal file
27
docs/src/searx.engines.peertube.rst
Normal file
|
@ -0,0 +1,27 @@
|
|||
.. _peertube engines:
|
||||
|
||||
================
|
||||
Peertube Engines
|
||||
================
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
|
||||
.. _peertube video engine:
|
||||
|
||||
Peertube Video
|
||||
==============
|
||||
|
||||
.. automodule:: searx.engines.peertube
|
||||
:members:
|
||||
|
||||
.. _sepiasearch engine:
|
||||
|
||||
SepiaSearch
|
||||
===========
|
||||
|
||||
.. automodule:: searx.engines.sepiasearch
|
||||
:members:
|
|
@ -1,8 +1,8 @@
|
|||
.. _load_engines:
|
||||
.. _searx.engines:
|
||||
|
||||
============
|
||||
Load Engines
|
||||
============
|
||||
=================
|
||||
SearXNG's engines
|
||||
=================
|
||||
|
||||
.. automodule:: searx.engines
|
||||
:members:
|
||||
|
|
13
docs/src/searx.engines.startpage.rst
Normal file
13
docs/src/searx.engines.startpage.rst
Normal file
|
@ -0,0 +1,13 @@
|
|||
.. _startpage engines:
|
||||
|
||||
=================
|
||||
Startpage engines
|
||||
=================
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
.. automodule:: searx.engines.startpage
|
||||
:members:
|
27
docs/src/searx.engines.wikipedia.rst
Normal file
27
docs/src/searx.engines.wikipedia.rst
Normal file
|
@ -0,0 +1,27 @@
|
|||
.. _wikimedia engines:
|
||||
|
||||
=========
|
||||
Wikimedia
|
||||
=========
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
|
||||
.. _wikipedia engine:
|
||||
|
||||
Wikipedia
|
||||
=========
|
||||
|
||||
.. automodule:: searx.engines.wikipedia
|
||||
:members:
|
||||
|
||||
.. _wikidata engine:
|
||||
|
||||
Wikidata
|
||||
=========
|
||||
|
||||
.. automodule:: searx.engines.wikidata
|
||||
:members:
|
|
@ -4,5 +4,17 @@
|
|||
Locales
|
||||
=======
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
.. automodule:: searx.locales
|
||||
:members:
|
||||
|
||||
|
||||
SearXNG's locale codes
|
||||
======================
|
||||
|
||||
.. automodule:: searx.sxng_locales
|
||||
:members:
|
||||
|
|
47
docs/src/searx.search.processors.rst
Normal file
47
docs/src/searx.search.processors.rst
Normal file
|
@ -0,0 +1,47 @@
|
|||
.. _searx.search.processors:
|
||||
|
||||
=================
|
||||
Search processors
|
||||
=================
|
||||
|
||||
.. contents:: Contents
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
|
||||
Abstract processor class
|
||||
========================
|
||||
|
||||
.. automodule:: searx.search.processors.abstract
|
||||
:members:
|
||||
|
||||
Offline processor
|
||||
=================
|
||||
|
||||
.. automodule:: searx.search.processors.offline
|
||||
:members:
|
||||
|
||||
Online processor
|
||||
================
|
||||
|
||||
.. automodule:: searx.search.processors.online
|
||||
:members:
|
||||
|
||||
Online currency processor
|
||||
=========================
|
||||
|
||||
.. automodule:: searx.search.processors.online_currency
|
||||
:members:
|
||||
|
||||
Online Dictionary processor
|
||||
===========================
|
||||
|
||||
.. automodule:: searx.search.processors.online_dictionary
|
||||
:members:
|
||||
|
||||
Online URL search processor
|
||||
===========================
|
||||
|
||||
.. automodule:: searx.search.processors.online_url_search
|
||||
:members:
|
2
manage
2
manage
|
@ -63,7 +63,7 @@ PYLINT_SEARXNG_DISABLE_OPTION="\
|
|||
I,C,R,\
|
||||
W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\
|
||||
E1136"
|
||||
PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_languages,language_aliases,logger,categories"
|
||||
PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="traits,supported_languages,language_aliases,logger,categories"
|
||||
PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc"
|
||||
|
||||
help() {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
certifi==2022.12.7
|
||||
babel==2.11.0
|
||||
babel==2.12.1
|
||||
flask-babel==3.0.1
|
||||
flask==2.2.3
|
||||
jinja2==3.1.2
|
||||
|
|
|
@ -5,20 +5,20 @@
|
|||
"""
|
||||
# pylint: disable=use-dict-literal
|
||||
|
||||
from json import loads
|
||||
import json
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from lxml import etree
|
||||
import lxml
|
||||
from httpx import HTTPError
|
||||
|
||||
from searx import settings
|
||||
from searx.data import ENGINES_LANGUAGES
|
||||
from searx.engines import (
|
||||
engines,
|
||||
google,
|
||||
)
|
||||
from searx.network import get as http_get
|
||||
from searx.exceptions import SearxEngineResponseException
|
||||
|
||||
# a fetch_supported_languages() for XPath engines isn't available right now
|
||||
# _brave = ENGINES_LANGUAGES['brave'].keys()
|
||||
|
||||
|
||||
def get(*args, **kwargs):
|
||||
if 'timeout' not in kwargs:
|
||||
|
@ -55,34 +55,58 @@ def dbpedia(query, _lang):
|
|||
results = []
|
||||
|
||||
if response.ok:
|
||||
dom = etree.fromstring(response.content)
|
||||
dom = lxml.etree.fromstring(response.content)
|
||||
results = dom.xpath('//Result/Label//text()')
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def duckduckgo(query, _lang):
|
||||
# duckduckgo autocompleter
|
||||
url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'
|
||||
def duckduckgo(query, sxng_locale):
|
||||
"""Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages"""
|
||||
|
||||
resp = loads(get(url.format(urlencode(dict(q=query)))).text)
|
||||
if len(resp) > 1:
|
||||
return resp[1]
|
||||
return []
|
||||
traits = engines['duckduckgo'].traits
|
||||
args = {
|
||||
'q': query,
|
||||
'kl': traits.get_region(sxng_locale, traits.all_locale),
|
||||
}
|
||||
|
||||
url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args)
|
||||
resp = get(url)
|
||||
|
||||
ret_val = []
|
||||
if resp.ok:
|
||||
j = resp.json()
|
||||
if len(j) > 1:
|
||||
ret_val = j[1]
|
||||
return ret_val
|
||||
|
||||
|
||||
def google(query, lang):
|
||||
# google autocompleter
|
||||
autocomplete_url = 'https://suggestqueries.google.com/complete/search?client=toolbar&'
|
||||
def google_complete(query, sxng_locale):
|
||||
"""Autocomplete from Google. Supports Google's languages and subdomains
|
||||
(:py:obj:`searx.engines.google.get_google_info`) by using the async REST
|
||||
API::
|
||||
|
||||
response = get(autocomplete_url + urlencode(dict(hl=lang, q=query)))
|
||||
https://{subdomain}/complete/search?{args}
|
||||
|
||||
"""
|
||||
|
||||
google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits)
|
||||
|
||||
url = 'https://{subdomain}/complete/search?{args}'
|
||||
args = urlencode(
|
||||
{
|
||||
'q': query,
|
||||
'client': 'gws-wiz',
|
||||
'hl': google_info['params']['hl'],
|
||||
}
|
||||
)
|
||||
results = []
|
||||
|
||||
if response.ok:
|
||||
dom = etree.fromstring(response.text)
|
||||
results = dom.xpath('//suggestion/@data')
|
||||
|
||||
resp = get(url.format(subdomain=google_info['subdomain'], args=args))
|
||||
if resp.ok:
|
||||
json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1]
|
||||
data = json.loads(json_txt)
|
||||
for item in data[0]:
|
||||
results.append(lxml.html.fromstring(item[0]).text_content())
|
||||
return results
|
||||
|
||||
|
||||
|
@ -109,9 +133,9 @@ def seznam(query, _lang):
|
|||
]
|
||||
|
||||
|
||||
def startpage(query, lang):
|
||||
# startpage autocompleter
|
||||
lui = ENGINES_LANGUAGES['startpage'].get(lang, 'english')
|
||||
def startpage(query, sxng_locale):
|
||||
"""Autocomplete from Startpage. Supports Startpage's languages"""
|
||||
lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
|
||||
url = 'https://startpage.com/suggestions?{query}'
|
||||
resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
|
||||
data = resp.json()
|
||||
|
@ -122,20 +146,20 @@ def swisscows(query, _lang):
|
|||
# swisscows autocompleter
|
||||
url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5'
|
||||
|
||||
resp = loads(get(url.format(query=urlencode({'query': query}))).text)
|
||||
resp = json.loads(get(url.format(query=urlencode({'query': query}))).text)
|
||||
return resp
|
||||
|
||||
|
||||
def qwant(query, lang):
|
||||
# qwant autocompleter (additional parameter : lang=en_en&count=xxx )
|
||||
url = 'https://api.qwant.com/api/suggest?{query}'
|
||||
|
||||
resp = get(url.format(query=urlencode({'q': query, 'lang': lang})))
|
||||
|
||||
def qwant(query, sxng_locale):
|
||||
"""Autocomplete from Qwant. Supports Qwant's regions."""
|
||||
results = []
|
||||
|
||||
locale = engines['qwant'].traits.get_region(sxng_locale, 'en_US')
|
||||
url = 'https://api.qwant.com/v3/suggest?{query}'
|
||||
resp = get(url.format(query=urlencode({'q': query, 'locale': locale, 'version': '2'})))
|
||||
|
||||
if resp.ok:
|
||||
data = loads(resp.text)
|
||||
data = resp.json()
|
||||
if data['status'] == 'success':
|
||||
for item in data['data']['items']:
|
||||
results.append(item['value'])
|
||||
|
@ -143,21 +167,38 @@ def qwant(query, lang):
|
|||
return results
|
||||
|
||||
|
||||
def wikipedia(query, lang):
|
||||
# wikipedia autocompleter
|
||||
url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
|
||||
def wikipedia(query, sxng_locale):
|
||||
"""Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
|
||||
results = []
|
||||
eng_traits = engines['wikipedia'].traits
|
||||
wiki_lang = eng_traits.get_language(sxng_locale, 'en')
|
||||
wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')
|
||||
|
||||
resp = loads(get(url.format(urlencode(dict(search=query)))).text)
|
||||
if len(resp) > 1:
|
||||
return resp[1]
|
||||
return []
|
||||
url = 'https://{wiki_netloc}/w/api.php?{args}'
|
||||
args = urlencode(
|
||||
{
|
||||
'action': 'opensearch',
|
||||
'format': 'json',
|
||||
'formatversion': '2',
|
||||
'search': query,
|
||||
'namespace': '0',
|
||||
'limit': '10',
|
||||
}
|
||||
)
|
||||
resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
|
||||
if resp.ok:
|
||||
data = resp.json()
|
||||
if len(data) > 1:
|
||||
results = data[1]
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def yandex(query, _lang):
|
||||
# yandex autocompleter
|
||||
url = "https://suggest.yandex.com/suggest-ff.cgi?{0}"
|
||||
|
||||
resp = loads(get(url.format(urlencode(dict(part=query)))).text)
|
||||
resp = json.loads(get(url.format(urlencode(dict(part=query)))).text)
|
||||
if len(resp) > 1:
|
||||
return resp[1]
|
||||
return []
|
||||
|
@ -166,7 +207,7 @@ def yandex(query, _lang):
|
|||
backends = {
|
||||
'dbpedia': dbpedia,
|
||||
'duckduckgo': duckduckgo,
|
||||
'google': google,
|
||||
'google': google_complete,
|
||||
'seznam': seznam,
|
||||
'startpage': startpage,
|
||||
'swisscows': swisscows,
|
||||
|
@ -177,12 +218,11 @@ backends = {
|
|||
}
|
||||
|
||||
|
||||
def search_autocomplete(backend_name, query, lang):
|
||||
def search_autocomplete(backend_name, query, sxng_locale):
|
||||
backend = backends.get(backend_name)
|
||||
if backend is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
return backend(query, lang)
|
||||
return backend(query, sxng_locale)
|
||||
except (HTTPError, SearxEngineResponseException):
|
||||
return []
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"""
|
||||
|
||||
__all__ = [
|
||||
'ENGINES_LANGUAGES',
|
||||
'ENGINE_TRAITS',
|
||||
'CURRENCIES',
|
||||
'USER_AGENTS',
|
||||
'EXTERNAL_URLS',
|
||||
|
@ -42,7 +42,6 @@ def ahmia_blacklist_loader():
|
|||
return f.read().split()
|
||||
|
||||
|
||||
ENGINES_LANGUAGES = _load('engines_languages.json')
|
||||
CURRENCIES = _load('currencies.json')
|
||||
USER_AGENTS = _load('useragents.json')
|
||||
EXTERNAL_URLS = _load('external_urls.json')
|
||||
|
@ -50,3 +49,4 @@ WIKIDATA_UNITS = _load('wikidata_units.json')
|
|||
EXTERNAL_BANGS = _load('external_bangs.json')
|
||||
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
|
||||
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
|
||||
ENGINE_TRAITS = _load('engine_traits.json')
|
||||
|
|
3810
searx/data/engine_traits.json
Normal file
3810
searx/data/engine_traits.json
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
136
searx/enginelib/__init__.py
Normal file
136
searx/enginelib/__init__.py
Normal file
|
@ -0,0 +1,136 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Engine related implementations
|
||||
|
||||
.. note::
|
||||
|
||||
The long term goal is to modularize all relevant implementations to the
|
||||
engines here in this Python package. In addition to improved modularization,
|
||||
this will also be necessary in part because the probability of circular
|
||||
imports will increase due to the increased typification of implementations in
|
||||
the future.
|
||||
|
||||
ToDo:
|
||||
|
||||
- move :py:obj:`searx.engines.load_engine` to a new module `searx.enginelib`.
|
||||
"""
|
||||
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Union, Dict, List, Callable, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from searx.enginelib import traits
|
||||
|
||||
|
||||
class Engine: # pylint: disable=too-few-public-methods
|
||||
"""Class of engine instances built from YAML settings.
|
||||
|
||||
Further documentation see :ref:`general engine configuration`.
|
||||
|
||||
.. hint::
|
||||
|
||||
This class is currently never initialized and only used for type hinting.
|
||||
"""
|
||||
|
||||
# Common options in the engine module
|
||||
|
||||
engine_type: str
|
||||
"""Type of the engine (:origin:`searx/search/processors`)"""
|
||||
|
||||
paging: bool
|
||||
"""Engine supports multiple pages."""
|
||||
|
||||
time_range_support: bool
|
||||
"""Engine supports search time range."""
|
||||
|
||||
safesearch: bool
|
||||
"""Engine supports SafeSearch"""
|
||||
|
||||
language_support: bool
|
||||
"""Engine supports languages (locales) search."""
|
||||
|
||||
language: str
|
||||
"""For an engine, when there is ``language: ...`` in the YAML settings the engine
|
||||
does support only this one language:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: google french
|
||||
engine: google
|
||||
language: fr
|
||||
"""
|
||||
|
||||
region: str
|
||||
"""For an engine, when there is ``region: ...`` in the YAML settings the engine
|
||||
does support only this one region:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: google belgium
|
||||
engine: google
|
||||
region: fr-BE
|
||||
"""
|
||||
|
||||
fetch_traits: Callable
|
||||
"""Function to fetch engine's traits from origin."""
|
||||
|
||||
traits: traits.EngineTraits
|
||||
"""Traits of the engine."""
|
||||
|
||||
# settings.yml
|
||||
|
||||
categories: List[str]
|
||||
"""Tabs, in which the engine is working."""
|
||||
|
||||
name: str
|
||||
"""Name that will be used across SearXNG to define this engine. In settings, on
|
||||
the result page .."""
|
||||
|
||||
engine: str
|
||||
"""Name of the python file used to handle requests and responses to and from
|
||||
this search engine (file name from :origin:`searx/engines` without
|
||||
``.py``)."""
|
||||
|
||||
enable_http: bool
|
||||
"""Enable HTTP (by default only HTTPS is enabled)."""
|
||||
|
||||
shortcut: str
|
||||
"""Code used to execute bang requests (``!foo``)"""
|
||||
|
||||
timeout: float
|
||||
"""Specific timeout for search-engine."""
|
||||
|
||||
display_error_messages: bool
|
||||
"""Display error messages on the web UI."""
|
||||
|
||||
proxies: dict
|
||||
"""Set proxies for a specific engine (YAML):
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
proxies :
|
||||
http: socks5://proxy:port
|
||||
https: socks5://proxy:port
|
||||
"""
|
||||
|
||||
disabled: bool
|
||||
"""To disable by default the engine, but not deleting it. It will allow the
|
||||
user to manually activate it in the settings."""
|
||||
|
||||
inactive: bool
|
||||
"""Remove the engine from the settings (*disabled & removed*)."""
|
||||
|
||||
about: dict
|
||||
"""Additional fields describing the engine.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
about:
|
||||
website: https://example.com
|
||||
wikidata_id: Q306656
|
||||
official_api_documentation: https://example.com/api-doc
|
||||
use_official_api: true
|
||||
require_api_key: true
|
||||
results: HTML
|
||||
"""
|
250
searx/enginelib/traits.py
Normal file
250
searx/enginelib/traits.py
Normal file
|
@ -0,0 +1,250 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Engine's traits are fetched from the origin engines and stored in a JSON file
|
||||
in the *data folder*. Most often traits are languages and region codes and
|
||||
their mapping from SearXNG's representation to the representation in the origin
|
||||
search engine. For new traits new properties can be added to the class
|
||||
:py:class:`EngineTraits`.
|
||||
|
||||
To load traits from the persistence :py:obj:`EngineTraitsMap.from_data` can be
|
||||
used.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import dataclasses
|
||||
from typing import Dict, Union, Callable, Optional, TYPE_CHECKING
|
||||
from typing_extensions import Literal, Self
|
||||
|
||||
from searx import locales
|
||||
from searx.data import data_dir, ENGINE_TRAITS
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from . import Engine
|
||||
|
||||
|
||||
class EngineTraitsEncoder(json.JSONEncoder):
|
||||
"""Encodes :class:`EngineTraits` to a serializable object, see
|
||||
:class:`json.JSONEncoder`."""
|
||||
|
||||
def default(self, o):
|
||||
"""Return dictionary of a :class:`EngineTraits` object."""
|
||||
if isinstance(o, EngineTraits):
|
||||
return o.__dict__
|
||||
return super().default(o)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class EngineTraits:
|
||||
"""The class is intended to be instantiated for each engine."""
|
||||
|
||||
regions: Dict[str, str] = dataclasses.field(default_factory=dict)
|
||||
"""Maps SearXNG's internal representation of a region to the one of the engine.
|
||||
|
||||
SearXNG's internal representation can be parsed by babel and the value is
|
||||
sent to the engine:
|
||||
|
||||
.. code:: python
|
||||
|
||||
regions ={
|
||||
'fr-BE' : <engine's region name>,
|
||||
}
|
||||
|
||||
for key, engine_region in regions.items():
|
||||
searxng_region = babel.Locale.parse(key, sep='-')
|
||||
...
|
||||
"""
|
||||
|
||||
languages: Dict[str, str] = dataclasses.field(default_factory=dict)
|
||||
"""Maps SearXNG's internal representation of a language to the one of the engine.
|
||||
|
||||
SearXNG's internal representation can be parsed by babel and the value is
|
||||
sent to the engine:
|
||||
|
||||
.. code:: python
|
||||
|
||||
languages = {
|
||||
'ca' : <engine's language name>,
|
||||
}
|
||||
|
||||
for key, engine_lang in languages.items():
|
||||
searxng_lang = babel.Locale.parse(key)
|
||||
...
|
||||
"""
|
||||
|
||||
all_locale: Optional[str] = None
|
||||
"""To which locale value SearXNG's ``all`` language is mapped (shown as a "Default
|
||||
language").
|
||||
"""
|
||||
|
||||
data_type: Literal['traits_v1'] = 'traits_v1'
|
||||
"""Data type, default is 'traits_v1'.
|
||||
"""
|
||||
|
||||
custom: Dict[str, Dict] = dataclasses.field(default_factory=dict)
|
||||
"""A place to store engine's custom traits, not related to the SearXNG core
|
||||
|
||||
"""
|
||||
|
||||
def get_language(self, searxng_locale: str, default=None):
|
||||
"""Return engine's language string that *best fits* to SearXNG's locale.
|
||||
|
||||
:param searxng_locale: SearXNG's internal representation of locale
|
||||
selected by the user.
|
||||
|
||||
:param default: engine's default language
|
||||
|
||||
The *best fits* rules are implemented in
|
||||
:py:obj:`locales.get_engine_locale`. Except for the special value ``all``
|
||||
which is determined from :py:obj:`EngineTraits.all_locale`.
|
||||
"""
|
||||
if searxng_locale == 'all' and self.all_locale is not None:
|
||||
return self.all_locale
|
||||
return locales.get_engine_locale(searxng_locale, self.languages, default=default)
|
||||
|
||||
def get_region(self, searxng_locale: str, default=None):
|
||||
"""Return engine's region string that best fits to SearXNG's locale.
|
||||
|
||||
:param searxng_locale: SearXNG's internal representation of locale
|
||||
selected by the user.
|
||||
|
||||
:param default: engine's default region
|
||||
|
||||
The *best fits* rules are implemented in
|
||||
:py:obj:`locales.get_engine_locale`. Except for the special value ``all``
|
||||
which is determined from :py:obj:`EngineTraits.all_locale`.
|
||||
"""
|
||||
if searxng_locale == 'all' and self.all_locale is not None:
|
||||
return self.all_locale
|
||||
return locales.get_engine_locale(searxng_locale, self.regions, default=default)
|
||||
|
||||
def is_locale_supported(self, searxng_locale: str) -> bool:
|
||||
"""A *locale* (SearXNG's internal representation) is considered to be supported
|
||||
by the engine if the *region* or the *language* is supported by the
|
||||
engine. For verification the functions :py:func:`self.get_region` and
|
||||
:py:func:`self.get_language` are used.
|
||||
"""
|
||||
if self.data_type == 'traits_v1':
|
||||
return bool(self.get_region(searxng_locale) or self.get_language(searxng_locale))
|
||||
|
||||
raise TypeError('engine traits of type %s is unknown' % self.data_type)
|
||||
|
||||
def copy(self):
|
||||
"""Create a copy of the dataclass object."""
|
||||
return EngineTraits(**dataclasses.asdict(self))
|
||||
|
||||
@classmethod
|
||||
def fetch_traits(cls, engine: Engine) -> Union[Self, None]:
|
||||
"""Call a function ``fetch_traits(engine_traits)`` from engines namespace to fetch
|
||||
and set properties from the origin engine in the object ``engine_traits``. If
|
||||
function does not exist, ``None`` is returned.
|
||||
"""
|
||||
|
||||
fetch_traits = getattr(engine, 'fetch_traits', None)
|
||||
engine_traits = None
|
||||
|
||||
if fetch_traits:
|
||||
engine_traits = cls()
|
||||
fetch_traits(engine_traits)
|
||||
return engine_traits
|
||||
|
||||
def set_traits(self, engine: Engine):
|
||||
"""Set traits from self object in a :py:obj:`.Engine` namespace.
|
||||
|
||||
:param engine: engine instance built by :py:func:`searx.engines.load_engine`
|
||||
"""
|
||||
|
||||
if self.data_type == 'traits_v1':
|
||||
self._set_traits_v1(engine)
|
||||
else:
|
||||
raise TypeError('engine traits of type %s is unknown' % self.data_type)
|
||||
|
||||
def _set_traits_v1(self, engine: Engine):
|
||||
# For an engine, when there is `language: ...` in the YAML settings the engine
|
||||
# does support only this one language (region)::
|
||||
#
|
||||
# - name: google italian
|
||||
# engine: google
|
||||
# language: it
|
||||
# region: it-IT
|
||||
|
||||
traits = self.copy()
|
||||
|
||||
_msg = "settings.yml - engine: '%s' / %s: '%s' not supported"
|
||||
|
||||
languages = traits.languages
|
||||
if hasattr(engine, 'language'):
|
||||
if engine.language not in languages:
|
||||
raise ValueError(_msg % (engine.name, 'language', engine.language))
|
||||
traits.languages = {engine.language: languages[engine.language]}
|
||||
|
||||
regions = traits.regions
|
||||
if hasattr(engine, 'region'):
|
||||
if engine.region not in regions:
|
||||
raise ValueError(_msg % (engine.name, 'region', engine.region))
|
||||
traits.regions = {engine.region: regions[engine.region]}
|
||||
|
||||
engine.language_support = bool(traits.languages or traits.regions)
|
||||
|
||||
# set the copied & modified traits in engine's namespace
|
||||
engine.traits = traits
|
||||
|
||||
|
||||
class EngineTraitsMap(Dict[str, EngineTraits]):
|
||||
"""A python dictionary to map :class:`EngineTraits` by engine name."""
|
||||
|
||||
ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve()
|
||||
"""File with persistence of the :py:obj:`EngineTraitsMap`."""
|
||||
|
||||
def save_data(self):
|
||||
"""Store EngineTraitsMap in file :py:obj:`self.ENGINE_TRAITS_FILE`"""
|
||||
with open(self.ENGINE_TRAITS_FILE, 'w', encoding='utf-8') as f:
|
||||
json.dump(self, f, indent=2, sort_keys=True, cls=EngineTraitsEncoder)
|
||||
|
||||
@classmethod
|
||||
def from_data(cls) -> Self:
|
||||
"""Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`"""
|
||||
obj = cls()
|
||||
for k, v in ENGINE_TRAITS.items():
|
||||
obj[k] = EngineTraits(**v)
|
||||
return obj
|
||||
|
||||
@classmethod
|
||||
def fetch_traits(cls, log: Callable) -> Self:
|
||||
from searx import engines # pylint: disable=cyclic-import, import-outside-toplevel
|
||||
|
||||
names = list(engines.engines)
|
||||
names.sort()
|
||||
obj = cls()
|
||||
|
||||
for engine_name in names:
|
||||
engine = engines.engines[engine_name]
|
||||
|
||||
traits = EngineTraits.fetch_traits(engine)
|
||||
if traits is not None:
|
||||
log("%-20s: SearXNG languages --> %s " % (engine_name, len(traits.languages)))
|
||||
log("%-20s: SearXNG regions --> %s" % (engine_name, len(traits.regions)))
|
||||
obj[engine_name] = traits
|
||||
|
||||
return obj
|
||||
|
||||
def set_traits(self, engine: Engine):
|
||||
"""Set traits in a :py:obj:`Engine` namespace.
|
||||
|
||||
:param engine: engine instance built by :py:func:`searx.engines.load_engine`
|
||||
"""
|
||||
|
||||
engine_traits = EngineTraits(data_type='traits_v1')
|
||||
if engine.name in self.keys():
|
||||
engine_traits = self[engine.name]
|
||||
|
||||
elif engine.engine in self.keys():
|
||||
# The key of the dictionary traits_map is the *engine name*
|
||||
# configured in settings.yml. When multiple engines are configured
|
||||
# in settings.yml to use the same origin engine (python module)
|
||||
# these additional engines can use the languages from the origin
|
||||
# engine. For this use the configured ``engine: ...`` from
|
||||
# settings.yml
|
||||
engine_traits = self[engine.engine]
|
||||
|
||||
engine_traits.set_traits(engine)
|
|
@ -11,24 +11,22 @@ usage::
|
|||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import copy
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from os.path import realpath, dirname
|
||||
from babel.localedata import locale_identifiers
|
||||
from searx import logger, settings
|
||||
from searx.data import ENGINES_LANGUAGES
|
||||
from searx.network import get
|
||||
from searx.utils import load_module, match_language, gen_useragent
|
||||
|
||||
from typing import TYPE_CHECKING, Dict, Optional
|
||||
|
||||
from searx import logger, settings
|
||||
from searx.utils import load_module
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from searx.enginelib import Engine
|
||||
|
||||
logger = logger.getChild('engines')
|
||||
ENGINE_DIR = dirname(realpath(__file__))
|
||||
BABEL_LANGS = [
|
||||
lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
|
||||
for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())
|
||||
]
|
||||
ENGINE_DEFAULT_ARGS = {
|
||||
"engine_type": "online",
|
||||
"inactive": False,
|
||||
|
@ -36,8 +34,6 @@ ENGINE_DEFAULT_ARGS = {
|
|||
"timeout": settings["outgoing"]["request_timeout"],
|
||||
"shortcut": "-",
|
||||
"categories": ["general"],
|
||||
"supported_languages": [],
|
||||
"language_aliases": {},
|
||||
"paging": False,
|
||||
"safesearch": False,
|
||||
"time_range_support": False,
|
||||
|
@ -52,24 +48,6 @@ ENGINE_DEFAULT_ARGS = {
|
|||
OTHER_CATEGORY = 'other'
|
||||
|
||||
|
||||
class Engine: # pylint: disable=too-few-public-methods
|
||||
"""This class is currently never initialized and only used for type hinting."""
|
||||
|
||||
name: str
|
||||
engine: str
|
||||
shortcut: str
|
||||
categories: List[str]
|
||||
supported_languages: List[str]
|
||||
about: dict
|
||||
inactive: bool
|
||||
disabled: bool
|
||||
language_support: bool
|
||||
paging: bool
|
||||
safesearch: bool
|
||||
time_range_support: bool
|
||||
timeout: float
|
||||
|
||||
|
||||
# Defaults for the namespace of an engine module, see :py:func:`load_engine`
|
||||
|
||||
categories = {'general': []}
|
||||
|
@ -136,9 +114,15 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
|
|||
return None
|
||||
|
||||
update_engine_attributes(engine, engine_data)
|
||||
set_language_attributes(engine)
|
||||
update_attributes_for_tor(engine)
|
||||
|
||||
# avoid cyclic imports
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from searx.enginelib.traits import EngineTraitsMap
|
||||
|
||||
trait_map = EngineTraitsMap.from_data()
|
||||
trait_map.set_traits(engine)
|
||||
|
||||
if not is_engine_active(engine):
|
||||
return None
|
||||
|
||||
|
@ -190,60 +174,6 @@ def update_engine_attributes(engine: Engine, engine_data):
|
|||
setattr(engine, arg_name, copy.deepcopy(arg_value))
|
||||
|
||||
|
||||
def set_language_attributes(engine: Engine):
|
||||
# assign supported languages from json file
|
||||
if engine.name in ENGINES_LANGUAGES:
|
||||
engine.supported_languages = ENGINES_LANGUAGES[engine.name]
|
||||
|
||||
elif engine.engine in ENGINES_LANGUAGES:
|
||||
# The key of the dictionary ENGINES_LANGUAGES is the *engine name*
|
||||
# configured in settings.xml. When multiple engines are configured in
|
||||
# settings.yml to use the same origin engine (python module) these
|
||||
# additional engines can use the languages from the origin engine.
|
||||
# For this use the configured ``engine: ...`` from settings.yml
|
||||
engine.supported_languages = ENGINES_LANGUAGES[engine.engine]
|
||||
|
||||
if hasattr(engine, 'language'):
|
||||
# For an engine, when there is `language: ...` in the YAML settings, the
|
||||
# engine supports only one language, in this case
|
||||
# engine.supported_languages should contains this value defined in
|
||||
# settings.yml
|
||||
if engine.language not in engine.supported_languages:
|
||||
raise ValueError(
|
||||
"settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
|
||||
)
|
||||
|
||||
if isinstance(engine.supported_languages, dict):
|
||||
engine.supported_languages = {engine.language: engine.supported_languages[engine.language]}
|
||||
else:
|
||||
engine.supported_languages = [engine.language]
|
||||
|
||||
# find custom aliases for non standard language codes
|
||||
for engine_lang in engine.supported_languages:
|
||||
iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None)
|
||||
if (
|
||||
iso_lang
|
||||
and iso_lang != engine_lang
|
||||
and not engine_lang.startswith(iso_lang)
|
||||
and iso_lang not in engine.supported_languages
|
||||
):
|
||||
engine.language_aliases[iso_lang] = engine_lang
|
||||
|
||||
# language_support
|
||||
engine.language_support = len(engine.supported_languages) > 0
|
||||
|
||||
# assign language fetching method if auxiliary method exists
|
||||
if hasattr(engine, '_fetch_supported_languages'):
|
||||
headers = {
|
||||
'User-Agent': gen_useragent(),
|
||||
'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language
|
||||
}
|
||||
engine.fetch_supported_languages = (
|
||||
# pylint: disable=protected-access
|
||||
lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers))
|
||||
)
|
||||
|
||||
|
||||
def update_attributes_for_tor(engine: Engine) -> bool:
|
||||
if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):
|
||||
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
|
||||
|
|
|
@ -1,15 +1,32 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Arch Linux Wiki
|
||||
Arch Linux Wiki
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
This implementation does not use an official API: Mediawiki provides an API, but
|
||||
Arch Wiki blocks access to it.
|
||||
|
||||
API: Mediawiki provides API, but Arch Wiki blocks access to it
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode, urljoin
|
||||
from lxml import html
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urlencode, urljoin, urlparse
|
||||
import lxml
|
||||
import babel
|
||||
|
||||
from searx import network
|
||||
from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.locales import language_tag
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://wiki.archlinux.org/',
|
||||
"wikidata_id": 'Q101445877',
|
||||
|
@ -22,125 +39,113 @@ about = {
|
|||
# engine dependent config
|
||||
categories = ['it', 'software wikis']
|
||||
paging = True
|
||||
base_url = 'https://wiki.archlinux.org'
|
||||
|
||||
# xpath queries
|
||||
xpath_results = '//ul[@class="mw-search-results"]/li'
|
||||
xpath_link = './/div[@class="mw-search-result-heading"]/a'
|
||||
main_wiki = 'wiki.archlinux.org'
|
||||
|
||||
|
||||
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
|
||||
def locale_to_lang_code(locale):
|
||||
if locale.find('-') >= 0:
|
||||
locale = locale.split('-')[0]
|
||||
return locale
|
||||
|
||||
|
||||
# wikis for some languages were moved off from the main site, we need to make
|
||||
# requests to correct URLs to be able to get results in those languages
|
||||
lang_urls = {
|
||||
# fmt: off
|
||||
'all': {
|
||||
'base': 'https://wiki.archlinux.org',
|
||||
'search': '/index.php?title=Special:Search&offset={offset}&{query}'
|
||||
},
|
||||
'de': {
|
||||
'base': 'https://wiki.archlinux.de',
|
||||
'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}'
|
||||
},
|
||||
'fr': {
|
||||
'base': 'https://wiki.archlinux.fr',
|
||||
'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}'
|
||||
},
|
||||
'ja': {
|
||||
'base': 'https://wiki.archlinuxjp.org',
|
||||
'search': '/index.php?title=特別:検索&offset={offset}&{query}'
|
||||
},
|
||||
'ro': {
|
||||
'base': 'http://wiki.archlinux.ro',
|
||||
'search': '/index.php?title=Special:Căutare&offset={offset}&{query}'
|
||||
},
|
||||
'tr': {
|
||||
'base': 'http://archtr.org/wiki',
|
||||
'search': '/index.php?title=Özel:Ara&offset={offset}&{query}'
|
||||
}
|
||||
# fmt: on
|
||||
}
|
||||
|
||||
|
||||
# get base & search URLs for selected language
|
||||
def get_lang_urls(language):
|
||||
if language in lang_urls:
|
||||
return lang_urls[language]
|
||||
return lang_urls['all']
|
||||
|
||||
|
||||
# Language names to build search requests for
|
||||
# those languages which are hosted on the main site.
|
||||
main_langs = {
|
||||
'ar': 'العربية',
|
||||
'bg': 'Български',
|
||||
'cs': 'Česky',
|
||||
'da': 'Dansk',
|
||||
'el': 'Ελληνικά',
|
||||
'es': 'Español',
|
||||
'he': 'עברית',
|
||||
'hr': 'Hrvatski',
|
||||
'hu': 'Magyar',
|
||||
'it': 'Italiano',
|
||||
'ko': '한국어',
|
||||
'lt': 'Lietuviškai',
|
||||
'nl': 'Nederlands',
|
||||
'pl': 'Polski',
|
||||
'pt': 'Português',
|
||||
'ru': 'Русский',
|
||||
'sl': 'Slovenský',
|
||||
'th': 'ไทย',
|
||||
'uk': 'Українська',
|
||||
'zh': '简体中文',
|
||||
}
|
||||
supported_languages = dict(lang_urls, **main_langs)
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
# translate the locale (e.g. 'en-US') to language code ('en')
|
||||
language = locale_to_lang_code(params['language'])
|
||||
|
||||
# if our language is hosted on the main site, we need to add its name
|
||||
# to the query in order to narrow the results to that language
|
||||
if language in main_langs:
|
||||
query += ' (' + main_langs[language] + ')'
|
||||
|
||||
# prepare the request parameters
|
||||
query = urlencode({'search': query})
|
||||
sxng_lang = params['searxng_locale'].split('-')[0]
|
||||
netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki)
|
||||
title = traits.custom['title'].get(sxng_lang, 'Special:Search')
|
||||
base_url = 'https://' + netloc + '/index.php?'
|
||||
offset = (params['pageno'] - 1) * 20
|
||||
|
||||
# get request URLs for our language of choice
|
||||
urls = get_lang_urls(language)
|
||||
search_url = urls['base'] + urls['search']
|
||||
if netloc == main_wiki:
|
||||
eng_lang: str = traits.get_language(sxng_lang, 'English')
|
||||
query += ' (' + eng_lang + ')'
|
||||
elif netloc == 'wiki.archlinuxcn.org':
|
||||
base_url = 'https://' + netloc + '/wzh/index.php?'
|
||||
|
||||
params['url'] = search_url.format(query=query, offset=offset)
|
||||
args = {
|
||||
'search': query,
|
||||
'title': title,
|
||||
'limit': 20,
|
||||
'offset': offset,
|
||||
'profile': 'default',
|
||||
}
|
||||
|
||||
params['url'] = base_url + urlencode(args)
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
# get the base URL for the language in which request was made
|
||||
language = locale_to_lang_code(resp.search_params['language'])
|
||||
base_url = get_lang_urls(language)['base']
|
||||
|
||||
results = []
|
||||
dom = lxml.html.fromstring(resp.text)
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
# get the base URL for the language in which request was made
|
||||
sxng_lang = resp.search_params['searxng_locale'].split('-')[0]
|
||||
netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki)
|
||||
base_url = 'https://' + netloc + '/index.php?'
|
||||
|
||||
# parse results
|
||||
for result in eval_xpath_list(dom, xpath_results):
|
||||
link = eval_xpath_getindex(result, xpath_link, 0)
|
||||
href = urljoin(base_url, link.attrib.get('href'))
|
||||
title = extract_text(link)
|
||||
|
||||
results.append({'url': href, 'title': title})
|
||||
for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'):
|
||||
link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0)
|
||||
content = extract_text(result.xpath('.//div[@class="searchresult"]'))
|
||||
results.append(
|
||||
{
|
||||
'url': urljoin(base_url, link.get('href')),
|
||||
'title': extract_text(link),
|
||||
'content': content,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch languages from Arch Linux Wiki. The location of the Wiki address of a
|
||||
language is mapped in a :py:obj:`custom field
|
||||
<searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``). Depending
|
||||
on the location, the ``title`` argument in the request is translated.
|
||||
|
||||
.. code:: python
|
||||
|
||||
"custom": {
|
||||
"wiki_netloc": {
|
||||
"de": "wiki.archlinux.de",
|
||||
# ...
|
||||
"zh": "wiki.archlinuxcn.org"
|
||||
}
|
||||
"title": {
|
||||
"de": "Spezial:Suche",
|
||||
# ...
|
||||
"zh": "Special:\u641c\u7d22"
|
||||
},
|
||||
},
|
||||
|
||||
"""
|
||||
|
||||
engine_traits.custom['wiki_netloc'] = {}
|
||||
engine_traits.custom['title'] = {}
|
||||
|
||||
title_map = {
|
||||
'de': 'Spezial:Suche',
|
||||
'fa': 'ویژه:جستجو',
|
||||
'ja': '特別:検索',
|
||||
'zh': 'Special:搜索',
|
||||
}
|
||||
|
||||
resp = network.get('https://wiki.archlinux.org/')
|
||||
if not resp.ok:
|
||||
print("ERROR: response from wiki.archlinix.org is not OK.")
|
||||
|
||||
dom = lxml.html.fromstring(resp.text)
|
||||
for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"):
|
||||
|
||||
sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-'))
|
||||
# zh_Hans --> zh
|
||||
sxng_tag = sxng_tag.split('_')[0]
|
||||
|
||||
netloc = urlparse(a.get('href')).netloc
|
||||
if netloc != 'wiki.archlinux.org':
|
||||
title = title_map.get(sxng_tag)
|
||||
if not title:
|
||||
print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag))
|
||||
continue
|
||||
engine_traits.custom['wiki_netloc'][sxng_tag] = netloc
|
||||
engine_traits.custom['title'][sxng_tag] = title
|
||||
|
||||
eng_tag = extract_text(eval_xpath_list(a, ".//span"))
|
||||
engine_traits.languages[sxng_tag] = eng_tag
|
||||
|
||||
engine_traits.languages['en'] = 'English'
|
||||
|
|
|
@ -1,16 +1,53 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Bing (Web)
|
||||
"""This is the implementation of the Bing-WEB engine. Some of these
|
||||
implementations are shared by other engines:
|
||||
|
||||
- :ref:`bing images engine`
|
||||
- :ref:`bing news engine`
|
||||
- :ref:`bing videos engine`
|
||||
|
||||
On the `preference page`_ Bing offers a lot of languages and regions (see section
|
||||
'Search results languages' and 'Country/region'). However, the abundant choice
|
||||
does not correspond to reality, where Bing has a full-text indexer only for a
|
||||
limited number of languages. For example: you can select a language like Māori
|
||||
but you never get a result in this language.
|
||||
|
||||
What comes a bit closer to the truth are the `search-APIs`_ but they don't seem
|
||||
to be completely correct either (if you take a closer look you will find some
|
||||
inaccuracies there too):
|
||||
|
||||
- :py:obj:`searx.engines.bing.bing_traits_url`
|
||||
- :py:obj:`searx.engines.bing_videos.bing_traits_url`
|
||||
- :py:obj:`searx.engines.bing_images.bing_traits_url`
|
||||
- :py:obj:`searx.engines.bing_news.bing_traits_url`
|
||||
|
||||
.. _preference page: https://www.bing.com/account/general
|
||||
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
|
||||
|
||||
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
|
||||
"""
|
||||
# pylint: disable=too-many-branches
|
||||
# pylint: disable=too-many-branches, invalid-name
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import datetime
|
||||
import re
|
||||
from urllib.parse import urlencode, urlparse, parse_qs
|
||||
import uuid
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
|
||||
from searx.network import multi_requests, Request
|
||||
import babel
|
||||
import babel.languages
|
||||
|
||||
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
|
||||
from searx import network
|
||||
from searx.locales import language_tag, region_tag
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
about = {
|
||||
"website": 'https://www.bing.com',
|
||||
|
@ -21,56 +58,124 @@ about = {
|
|||
"results": 'HTML',
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
"""Bing tries to guess user's language and territory from the HTTP
|
||||
Accept-Language. Optionally, the user can select a search-language (can be
|
||||
different to the UI language) and a region (market code)."""
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general', 'web']
|
||||
paging = True
|
||||
time_range_support = False
|
||||
safesearch = False
|
||||
send_accept_language_header = True
|
||||
supported_languages_url = 'https://www.bing.com/account/general'
|
||||
language_aliases = {}
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT
|
||||
|
||||
# search-url
|
||||
base_url = 'https://www.bing.com/'
|
||||
base_url = 'https://www.bing.com/search'
|
||||
"""Bing (Web) search URL"""
|
||||
|
||||
# initial query: https://www.bing.com/search?q=foo&search=&form=QBLH
|
||||
inital_query = 'search?{query}&search=&form=QBLH'
|
||||
|
||||
# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE
|
||||
page_query = 'search?{query}&search=&first={offset}&FORM=PERE'
|
||||
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
|
||||
"""Bing (Web) search API description"""
|
||||
|
||||
|
||||
def _get_offset_from_pageno(pageno):
|
||||
return (pageno - 1) * 10 + 1
|
||||
|
||||
|
||||
def set_bing_cookies(params, engine_language, engine_region, SID):
|
||||
|
||||
# set cookies
|
||||
# -----------
|
||||
|
||||
params['cookies']['_EDGE_V'] = '1'
|
||||
|
||||
# _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
|
||||
_EDGE_S = [
|
||||
'F=1',
|
||||
'SID=%s' % SID,
|
||||
'mkt=%s' % engine_region.lower(),
|
||||
'ui=%s' % engine_language.lower(),
|
||||
]
|
||||
params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
|
||||
logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
|
||||
|
||||
# "_EDGE_CD": "m=zh-tw",
|
||||
|
||||
_EDGE_CD = [ # pylint: disable=invalid-name
|
||||
'm=%s' % engine_region.lower(), # search region: zh-cn
|
||||
'u=%s' % engine_language.lower(), # UI: en-us
|
||||
]
|
||||
|
||||
params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
|
||||
logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
|
||||
|
||||
SRCHHPGUSR = [ # pylint: disable=invalid-name
|
||||
'SRCHLANG=%s' % engine_language,
|
||||
# Trying to set ADLT cookie here seems not to have any effect, I assume
|
||||
# there is some age verification by a cookie (and/or session ID) needed,
|
||||
# to disable the SafeSearch.
|
||||
'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
|
||||
]
|
||||
params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
|
||||
logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Assemble a Bing-Web request."""
|
||||
|
||||
offset = _get_offset_from_pageno(params.get('pageno', 1))
|
||||
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en')
|
||||
|
||||
# logger.debug("params['pageno'] --> %s", params.get('pageno'))
|
||||
# logger.debug(" offset --> %s", offset)
|
||||
SID = uuid.uuid1().hex.upper()
|
||||
CVID = uuid.uuid1().hex.upper()
|
||||
|
||||
search_string = page_query
|
||||
if offset == 1:
|
||||
search_string = inital_query
|
||||
set_bing_cookies(params, engine_language, engine_region, SID)
|
||||
|
||||
if params['language'] == 'all':
|
||||
lang = 'EN'
|
||||
else:
|
||||
lang = match_language(params['language'], supported_languages, language_aliases)
|
||||
# build URL query
|
||||
# ---------------
|
||||
|
||||
query = 'language:{} {}'.format(lang.split('-')[0].upper(), query)
|
||||
# query term
|
||||
page = int(params.get('pageno', 1))
|
||||
query_params = {
|
||||
# fmt: off
|
||||
'q': query,
|
||||
'pq': query,
|
||||
'cvid': CVID,
|
||||
'qs': 'n',
|
||||
'sp': '-1'
|
||||
# fmt: on
|
||||
}
|
||||
|
||||
search_path = search_string.format(query=urlencode({'q': query}), offset=offset)
|
||||
|
||||
if offset > 1:
|
||||
referer = base_url + inital_query.format(query=urlencode({'q': query}))
|
||||
# page
|
||||
if page > 1:
|
||||
referer = base_url + '?' + urlencode(query_params)
|
||||
params['headers']['Referer'] = referer
|
||||
logger.debug("headers.Referer --> %s", referer)
|
||||
|
||||
params['url'] = base_url + search_path
|
||||
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
query_params['first'] = _get_offset_from_pageno(page)
|
||||
|
||||
if page == 2:
|
||||
query_params['FORM'] = 'PERE'
|
||||
elif page > 2:
|
||||
query_params['FORM'] = 'PERE%s' % (page - 2)
|
||||
|
||||
filters = ''
|
||||
if params['time_range']:
|
||||
query_params['filt'] = 'custom'
|
||||
|
||||
if params['time_range'] == 'day':
|
||||
filters = 'ex1:"ez1"'
|
||||
elif params['time_range'] == 'week':
|
||||
filters = 'ex1:"ez2"'
|
||||
elif params['time_range'] == 'month':
|
||||
filters = 'ex1:"ez3"'
|
||||
elif params['time_range'] == 'year':
|
||||
epoch_1970 = datetime.date(1970, 1, 1)
|
||||
today_no = (datetime.date.today() - epoch_1970).days
|
||||
filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
|
||||
|
||||
params['url'] = base_url + '?' + urlencode(query_params)
|
||||
if filters:
|
||||
params['url'] = params['url'] + '&filters=' + filters
|
||||
return params
|
||||
|
||||
|
||||
|
@ -107,7 +212,8 @@ def response(resp):
|
|||
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
|
||||
# Bing can shorten the URL either at the end or in the middle of the string
|
||||
if (
|
||||
url_cite.startswith('https://')
|
||||
url_cite
|
||||
and url_cite.startswith('https://')
|
||||
and '…' not in url_cite
|
||||
and '...' not in url_cite
|
||||
and '›' not in url_cite
|
||||
|
@ -127,9 +233,9 @@ def response(resp):
|
|||
|
||||
# resolve all Bing redirections in parallel
|
||||
request_list = [
|
||||
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
|
||||
network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
|
||||
]
|
||||
response_list = multi_requests(request_list)
|
||||
response_list = network.multi_requests(request_list)
|
||||
for i, redirect_response in enumerate(response_list):
|
||||
if not isinstance(redirect_response, Exception):
|
||||
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
|
||||
|
@ -157,27 +263,71 @@ def response(resp):
|
|||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def _fetch_supported_languages(resp):
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch languages and regions from Bing-Web."""
|
||||
|
||||
lang_tags = set()
|
||||
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
|
||||
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
|
||||
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
|
||||
|
||||
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
|
||||
|
||||
|
||||
def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
|
||||
|
||||
# insert alias to map from a language (zh) to a language + script (zh_Hans)
|
||||
engine_traits.languages['zh'] = 'zh-hans'
|
||||
|
||||
resp = network.get(url)
|
||||
|
||||
if not resp.ok:
|
||||
print("ERROR: response from peertube is not OK.")
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
lang_links = eval_xpath(dom, '//div[@id="language-section"]//li')
|
||||
|
||||
for _li in lang_links:
|
||||
map_lang = {'jp': 'ja'}
|
||||
for td in eval_xpath(dom, xpath_language_codes):
|
||||
eng_lang = td.text
|
||||
|
||||
href = eval_xpath(_li, './/@href')[0]
|
||||
(_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href)
|
||||
query = parse_qs(query, keep_blank_values=True)
|
||||
if eng_lang in ('en-gb', 'pt-br'):
|
||||
# language 'en' is already in the list and a language 'en-gb' can't
|
||||
# be handled in SearXNG, same with pt-br which is covered by pt-pt.
|
||||
continue
|
||||
|
||||
# fmt: off
|
||||
setlang = query.get('setlang', [None, ])[0]
|
||||
# example: 'mn-Cyrl-MN' --> '['mn', 'Cyrl-MN']
|
||||
lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2] # fmt: skip
|
||||
# fmt: on
|
||||
babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
|
||||
try:
|
||||
sxng_tag = language_tag(babel.Locale.parse(babel_lang))
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: language (%s) is unknown by babel" % (eng_lang))
|
||||
continue
|
||||
conflict = engine_traits.languages.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_lang:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
|
||||
continue
|
||||
engine_traits.languages[sxng_tag] = eng_lang
|
||||
|
||||
tag = lang + '-' + nation if nation else lang
|
||||
lang_tags.add(tag)
|
||||
map_region = {
|
||||
'en-ID': 'id_ID',
|
||||
'no-NO': 'nb_NO',
|
||||
}
|
||||
|
||||
return list(lang_tags)
|
||||
for td in eval_xpath(dom, xpath_market_codes):
|
||||
eng_region = td.text
|
||||
babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
|
||||
|
||||
if eng_region == 'en-WW':
|
||||
engine_traits.all_locale = eng_region
|
||||
continue
|
||||
|
||||
try:
|
||||
sxng_tag = region_tag(babel.Locale.parse(babel_region))
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: region (%s) is unknown by babel" % (eng_region))
|
||||
continue
|
||||
conflict = engine_traits.regions.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_region:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
|
||||
continue
|
||||
engine_traits.regions[sxng_tag] = eng_region
|
||||
|
|
|
@ -1,20 +1,30 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Bing (Images)
|
||||
|
||||
"""Bing-Images: description see :py:obj:`searx.engines.bing`.
|
||||
"""
|
||||
# pylint: disable=invalid-name
|
||||
|
||||
from json import loads
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import uuid
|
||||
import json
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import match_language
|
||||
from searx.engines.bing import language_aliases
|
||||
from searx.engines.bing import ( # pylint: disable=unused-import
|
||||
_fetch_supported_languages,
|
||||
supported_languages_url,
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.engines.bing import (
|
||||
set_bing_cookies,
|
||||
_fetch_traits,
|
||||
)
|
||||
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -31,77 +41,92 @@ categories = ['images', 'web']
|
|||
paging = True
|
||||
safesearch = True
|
||||
time_range_support = True
|
||||
send_accept_language_header = True
|
||||
supported_languages_url = 'https://www.bing.com/account/general'
|
||||
number_of_results = 28
|
||||
|
||||
# search-url
|
||||
base_url = 'https://www.bing.com/'
|
||||
search_string = (
|
||||
base_url = 'https://www.bing.com/images/async'
|
||||
"""Bing (Images) search URL"""
|
||||
|
||||
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes'
|
||||
"""Bing (Images) search API description"""
|
||||
|
||||
time_map = {
|
||||
# fmt: off
|
||||
'images/search'
|
||||
'?{query}'
|
||||
'&count={count}'
|
||||
'&first={first}'
|
||||
'&tsc=ImageHoverTitle'
|
||||
'day': 60 * 24,
|
||||
'week': 60 * 24 * 7,
|
||||
'month': 60 * 24 * 31,
|
||||
'year': 60 * 24 * 365,
|
||||
# fmt: on
|
||||
)
|
||||
time_range_string = '&qft=+filterui:age-lt{interval}'
|
||||
time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'}
|
||||
|
||||
# safesearch definitions
|
||||
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}
|
||||
}
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
offset = ((params['pageno'] - 1) * number_of_results) + 1
|
||||
"""Assemble a Bing-Image request."""
|
||||
|
||||
search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset)
|
||||
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en')
|
||||
|
||||
language = match_language(params['language'], supported_languages, language_aliases).lower()
|
||||
SID = uuid.uuid1().hex.upper()
|
||||
set_bing_cookies(params, engine_language, engine_region, SID)
|
||||
|
||||
params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
|
||||
# build URL query
|
||||
# - example: https://www.bing.com/images/async?q=foo&first=155&count=35
|
||||
|
||||
params['cookies']['_EDGE_S'] = 'mkt=' + language + '&ui=' + language + '&F=1'
|
||||
query_params = {
|
||||
# fmt: off
|
||||
'q': query,
|
||||
'async' : 'content',
|
||||
# to simplify the page count let's use the default of 35 images per page
|
||||
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
|
||||
'count' : 35,
|
||||
# fmt: on
|
||||
}
|
||||
|
||||
params['url'] = base_url + search_path
|
||||
if params['time_range'] in time_range_dict:
|
||||
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
|
||||
# time range
|
||||
# - example: one year (525600 minutes) 'qft=+filterui:age-lt525600'
|
||||
|
||||
if params['time_range']:
|
||||
query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']]
|
||||
|
||||
params['url'] = base_url + '?' + urlencode(query_params)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
results = []
|
||||
"""Get response from Bing-Images"""
|
||||
|
||||
results = []
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
# parse results
|
||||
for result in dom.xpath('//div[@class="imgpt"]'):
|
||||
img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0]
|
||||
# Microsoft seems to experiment with this code so don't make the path too specific,
|
||||
# just catch the text section for the first anchor in img_info assuming this to be
|
||||
# the originating site.
|
||||
source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0]
|
||||
for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'):
|
||||
|
||||
m = loads(result.xpath('./a/@m')[0])
|
||||
metadata = result.xpath('.//a[@class="iusc"]/@m')
|
||||
if not metadata:
|
||||
continue
|
||||
|
||||
# strip 'Unicode private use area' highlighting, they render to Tux
|
||||
# the Linux penguin and a standing diamond on my machine...
|
||||
title = m.get('t', '').replace('\ue000', '').replace('\ue001', '')
|
||||
metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0])
|
||||
title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip()
|
||||
img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip()
|
||||
source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip()
|
||||
results.append(
|
||||
{
|
||||
'template': 'images.html',
|
||||
'url': m['purl'],
|
||||
'thumbnail_src': m['turl'],
|
||||
'img_src': m['murl'],
|
||||
'content': '',
|
||||
'url': metadata['purl'],
|
||||
'thumbnail_src': metadata['turl'],
|
||||
'img_src': metadata['murl'],
|
||||
'content': metadata['desc'],
|
||||
'title': title,
|
||||
'source': source,
|
||||
'img_format': img_format,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-Images.

    The work is delegated to ``_fetch_traits`` (imported from
    ``searx.engines.bing``); this function only supplies the URL of the
    market-codes page (``bing_traits_url``) and the XPath selectors of the
    table cells that hold the language codes and the market codes.
    """

    # second cell of every row in the third table
    language_codes_xpath = '//table[3]/tbody/tr/td[2]'
    # third cell of every row in the first table
    market_codes_xpath = '//table[1]/tbody/tr/td[3]'
    # not used: xpath_country_codes = '//table[2]/tbody/tr/td[2]'

    _fetch_traits(
        engine_traits,
        bing_traits_url,
        language_codes_xpath,
        market_codes_xpath,
    )
|
||||
|
|
|
@ -1,24 +1,30 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Bing (News)
|
||||
"""Bing-News: description see :py:obj:`searx.engines.bing`.
|
||||
"""
|
||||
|
||||
from urllib.parse import (
|
||||
urlencode,
|
||||
urlparse,
|
||||
parse_qsl,
|
||||
quote,
|
||||
)
|
||||
from datetime import datetime
|
||||
from dateutil import parser
|
||||
from lxml import etree
|
||||
from lxml.etree import XPath
|
||||
from searx.utils import match_language, eval_xpath_getindex
|
||||
from searx.engines.bing import ( # pylint: disable=unused-import
|
||||
language_aliases,
|
||||
_fetch_supported_languages,
|
||||
supported_languages_url,
|
||||
# pylint: disable=invalid-name
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
import uuid
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from lxml import html
|
||||
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.engines.bing import (
|
||||
set_bing_cookies,
|
||||
_fetch_traits,
|
||||
)
|
||||
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -34,108 +40,111 @@ about = {
|
|||
categories = ['news']
|
||||
paging = True
|
||||
time_range_support = True
|
||||
send_accept_language_header = True
|
||||
time_map = {
|
||||
'day': '4',
|
||||
'week': '8',
|
||||
'month': '9',
|
||||
}
|
||||
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
|
||||
difference of *last day* and *last week* in the result list is just marginal.
|
||||
"""
|
||||
|
||||
# search-url
|
||||
base_url = 'https://www.bing.com/'
|
||||
search_string = 'news/search?{query}&first={offset}&format=RSS'
|
||||
search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
|
||||
time_range_dict = {'day': '7', 'week': '8', 'month': '9'}
|
||||
base_url = 'https://www.bing.com/news/infinitescrollajax'
|
||||
"""Bing (News) search URL"""
|
||||
|
||||
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes'
|
||||
"""Bing (News) search API description"""
|
||||
|
||||
def url_cleanup(url_string):
    """Unwrap Bing's click-tracking redirect.

    A URL pointing to ``www.bing.com/news/apiclick.aspx`` is replaced by the
    value of its ``url`` query argument (``None`` if that argument is
    missing).  Any other URL is returned untouched.
    """

    parts = urlparse(url_string)
    is_click_redirect = parts.netloc == 'www.bing.com' and parts.path == '/news/apiclick.aspx'
    if not is_click_redirect:
        return url_string

    arguments = dict(parse_qsl(parts.query))
    return arguments.get('url', None)
|
||||
|
||||
|
||||
def image_url_cleanup(url_string):
    """Rewrite a Bing thumbnail URL to ``https://www.bing.com/th?id=...``.

    URLs whose host ends with ``bing.com`` and whose path is ``/th`` are
    replaced by the HTTPS thumbnail URL on ``www.bing.com`` that carries only
    the (quoted) ``id`` query argument.  Every other URL is returned
    unchanged.

    A thumbnail URL without an ``id`` argument is returned unchanged as well:
    there is nothing to rebuild it from, and handing ``None`` to ``quote``
    would raise a ``TypeError``.
    """

    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th':
        # parse_qsl drops blank values, so 'id' is absent for both '?x=1' and '?id='
        image_id = dict(parse_qsl(parsed_url.query)).get('id')
        if image_id is not None:
            url_string = "https://www.bing.com/th?id=" + quote(image_id)
    return url_string
|
||||
|
||||
|
||||
def _get_url(query, language, offset, time_range):
    """Build the Bing-News RSS search URL.

    ``query`` and ``language`` are URL-encoded as the ``q`` and ``setmkt``
    arguments (e.g. setmkt=de-de&setlang=de), ``offset`` becomes ``first``.
    When ``time_range`` is a key of ``time_range_dict`` the URL template with
    the interval filter is used, otherwise the plain template.
    """
    encoded_query = urlencode({'q': query, 'setmkt': language})

    if time_range not in time_range_dict:
        return base_url + search_string.format(query=encoded_query, offset=offset)

    path_with_interval = search_string_with_time.format(
        query=encoded_query,
        offset=offset,
        interval=time_range_dict[time_range],
    )
    return base_url + path_with_interval
|
||||
mkt_alias = {
|
||||
'zh': 'en-WW',
|
||||
'zh-CN': 'en-WW',
|
||||
}
|
||||
"""Bing News has an official market code 'zh-CN' but we won't get a result with
|
||||
this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate*
|
||||
market code (en-WW).
|
||||
"""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Assemble a Bing-News request."""
|
||||
|
||||
if params['time_range'] and params['time_range'] not in time_range_dict:
|
||||
return params
|
||||
sxng_locale = params['searxng_locale']
|
||||
engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale)
|
||||
engine_language = traits.get_language(sxng_locale, 'en')
|
||||
|
||||
offset = (params['pageno'] - 1) * 10 + 1
|
||||
if params['language'] == 'all':
|
||||
language = 'en-US'
|
||||
else:
|
||||
language = match_language(params['language'], supported_languages, language_aliases)
|
||||
params['url'] = _get_url(query, language, offset, params['time_range'])
|
||||
SID = uuid.uuid1().hex.upper()
|
||||
set_bing_cookies(params, engine_language, engine_region, SID)
|
||||
|
||||
# build URL query
|
||||
#
|
||||
# example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
|
||||
|
||||
query_params = {
|
||||
# fmt: off
|
||||
'q': query,
|
||||
'InfiniteScroll': 1,
|
||||
# to simplify the page count let's use the default of 10 results per page
|
||||
'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1,
|
||||
# fmt: on
|
||||
}
|
||||
|
||||
if params['time_range']:
|
||||
# qft=interval:"7"
|
||||
query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9')
|
||||
|
||||
params['url'] = base_url + '?' + urlencode(query_params)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
"""Get response from Bing-News"""
|
||||
results = []
|
||||
rss = etree.fromstring(resp.content)
|
||||
namespaces = rss.nsmap
|
||||
|
||||
for item in rss.xpath('./channel/item'):
|
||||
# url / title / content
|
||||
url = url_cleanup(eval_xpath_getindex(item, './link/text()', 0, default=None))
|
||||
title = eval_xpath_getindex(item, './title/text()', 0, default=url)
|
||||
content = eval_xpath_getindex(item, './description/text()', 0, default='')
|
||||
if not resp.ok or not resp.text:
|
||||
return results
|
||||
|
||||
# publishedDate
|
||||
publishedDate = eval_xpath_getindex(item, './pubDate/text()', 0, default=None)
|
||||
try:
|
||||
publishedDate = parser.parse(publishedDate, dayfirst=False)
|
||||
except TypeError:
|
||||
publishedDate = datetime.now()
|
||||
except ValueError:
|
||||
publishedDate = datetime.now()
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
# thumbnail
|
||||
thumbnail = eval_xpath_getindex(item, XPath('./News:Image/text()', namespaces=namespaces), 0, default=None)
|
||||
if thumbnail is not None:
|
||||
thumbnail = image_url_cleanup(thumbnail)
|
||||
for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'):
|
||||
|
||||
# append result
|
||||
if thumbnail is not None:
|
||||
results.append(
|
||||
{'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content, 'img_src': thumbnail}
|
||||
)
|
||||
else:
|
||||
results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content})
|
||||
url = newsitem.xpath('./@url')[0]
|
||||
title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip()
|
||||
content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip()
|
||||
thumbnail = None
|
||||
author = newsitem.xpath('./@data-author')[0]
|
||||
metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip()
|
||||
|
||||
img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src')
|
||||
if img_src:
|
||||
thumbnail = 'https://www.bing.com/' + img_src[0]
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'img_src': thumbnail,
|
||||
'author': author,
|
||||
'metadata': metadata,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-News.

    The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the
    first table says *"query parameter when calling the Video Search API."*
    That is why the fourth table, "News Category API markets", is used for the
    market codes instead of the first one.

    The work itself is delegated to ``_fetch_traits`` (imported from
    ``searx.engines.bing``).
    """

    # second cell of every row in the third table
    language_codes_xpath = '//table[3]/tbody/tr/td[2]'
    # third cell of every row in the fourth table
    market_codes_xpath = '//table[4]/tbody/tr/td[3]'
    # not used: xpath_country_codes = '//table[2]/tbody/tr/td[2]'

    _fetch_traits(
        engine_traits,
        bing_traits_url,
        language_codes_xpath,
        market_codes_xpath,
    )
|
||||
|
|
|
@ -1,21 +1,30 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Bing (Videos)
|
||||
|
||||
"""Bing-Videos: description see :py:obj:`searx.engines.bing`.
|
||||
"""
|
||||
# pylint: disable=invalid-name
|
||||
|
||||
from json import loads
|
||||
from typing import TYPE_CHECKING
|
||||
import uuid
|
||||
import json
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import match_language
|
||||
from searx.engines.bing import language_aliases
|
||||
|
||||
from searx.engines.bing import ( # pylint: disable=unused-import
|
||||
_fetch_supported_languages,
|
||||
supported_languages_url,
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.engines.bing import (
|
||||
set_bing_cookies,
|
||||
_fetch_traits,
|
||||
)
|
||||
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
about = {
|
||||
"website": 'https://www.bing.com/videos',
|
||||
|
@ -26,65 +35,76 @@ about = {
|
|||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['videos', 'web']
|
||||
paging = True
|
||||
safesearch = True
|
||||
time_range_support = True
|
||||
send_accept_language_header = True
|
||||
number_of_results = 28
|
||||
|
||||
base_url = 'https://www.bing.com/'
|
||||
search_string = (
|
||||
base_url = 'https://www.bing.com/videos/asyncv2'
|
||||
"""Bing (Videos) async search URL."""
|
||||
|
||||
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes'
|
||||
"""Bing (Video) search API description"""
|
||||
|
||||
time_map = {
|
||||
# fmt: off
|
||||
'videos/search'
|
||||
'?{query}'
|
||||
'&count={count}'
|
||||
'&first={first}'
|
||||
'&scope=video'
|
||||
'&FORM=QBLH'
|
||||
'day': 60 * 24,
|
||||
'week': 60 * 24 * 7,
|
||||
'month': 60 * 24 * 31,
|
||||
'year': 60 * 24 * 365,
|
||||
# fmt: on
|
||||
)
|
||||
time_range_string = '&qft=+filterui:videoage-lt{interval}'
|
||||
time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'}
|
||||
|
||||
# safesearch definitions
|
||||
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}
|
||||
}
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
offset = ((params['pageno'] - 1) * number_of_results) + 1
|
||||
"""Assemble a Bing-Video request."""
|
||||
|
||||
search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset)
|
||||
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en')
|
||||
|
||||
# safesearch cookie
|
||||
params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
|
||||
SID = uuid.uuid1().hex.upper()
|
||||
set_bing_cookies(params, engine_language, engine_region, SID)
|
||||
|
||||
# language cookie
|
||||
language = match_language(params['language'], supported_languages, language_aliases).lower()
|
||||
params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
|
||||
# build URL query
|
||||
#
|
||||
# example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
|
||||
|
||||
# query and paging
|
||||
params['url'] = base_url + search_path
|
||||
query_params = {
|
||||
# fmt: off
|
||||
'q': query,
|
||||
'async' : 'content',
|
||||
# to simplify the page count let's use the default of 35 results per page
|
||||
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
|
||||
'count' : 35,
|
||||
# fmt: on
|
||||
}
|
||||
|
||||
# time range
|
||||
if params['time_range'] in time_range_dict:
|
||||
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
|
||||
#
|
||||
# example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR'
|
||||
|
||||
if params['time_range']:
|
||||
query_params['form'] = 'VRFLTR'
|
||||
query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']]
|
||||
|
||||
params['url'] = base_url + '?' + urlencode(query_params)
|
||||
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
"""Get response from Bing-Video"""
|
||||
results = []
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for result in dom.xpath('//div[@class="dg_u"]/div[contains(@class, "mc_vtvc")]'):
|
||||
metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
|
||||
for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'):
|
||||
metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
|
||||
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
|
||||
content = '{0} - {1}'.format(metadata['du'], info)
|
||||
thumbnail = '{0}th?id={1}'.format(base_url, metadata['thid'])
|
||||
thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0]
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': metadata['murl'],
|
||||
|
@ -96,3 +116,13 @@ def response(resp):
|
|||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-Videos.

    The work is delegated to ``_fetch_traits`` (imported from
    ``searx.engines.bing``); this function only supplies the URL of the
    market-codes page (``bing_traits_url``) and the XPath selectors of the
    table cells that hold the language codes and the market codes.
    """

    # second cell of every row in the third table
    language_codes_xpath = '//table[3]/tbody/tr/td[2]'
    # third cell of every row in the first table
    market_codes_xpath = '//table[1]/tbody/tr/td[3]'
    # not used: xpath_country_codes = '//table[2]/tbody/tr/td[2]'

    _fetch_traits(
        engine_traits,
        bing_traits_url,
        language_codes_xpath,
        market_codes_xpath,
    )
|
||||
|
|
|
@ -1,17 +1,35 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Dailymotion (Videos)
|
||||
# lint: pylint
|
||||
"""
|
||||
Dailymotion (Videos)
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. _REST GET: https://developers.dailymotion.com/tools/
|
||||
.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters
|
||||
.. _Video filters API: https://developers.dailymotion.com/api/#video-filters
|
||||
.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection
|
||||
|
||||
"""
|
||||
|
||||
from typing import Set
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urlencode
|
||||
import time
|
||||
import babel
|
||||
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.network import raise_for_httperror
|
||||
from searx import network
|
||||
from searx.utils import html_to_text
|
||||
from searx.locales import region_tag, language_tag
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -37,11 +55,24 @@ time_delta_dict = {
|
|||
}
|
||||
|
||||
safesearch = True
|
||||
safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''}
|
||||
safesearch_params = {
|
||||
2: {'is_created_for_kids': 'true'},
|
||||
1: {'is_created_for_kids': 'true'},
|
||||
0: {},
|
||||
}
|
||||
"""True if this video is "Created for Kids" / intends to target an audience
|
||||
under the age of 16 (``is_created_for_kids`` in `Video filters API`_ )
|
||||
"""
|
||||
|
||||
# search-url
|
||||
# - https://developers.dailymotion.com/tools/
|
||||
# - https://www.dailymotion.com/doc/api/obj-video.html
|
||||
family_filter_map = {
|
||||
2: 'true',
|
||||
1: 'true',
|
||||
0: 'false',
|
||||
}
|
||||
"""By default, the family filter is turned on. Setting this parameter to
|
||||
``false`` will stop filtering-out explicit content from searches and global
|
||||
contexts (``family_filter`` in `Global API Parameters`_ ).
|
||||
"""
|
||||
|
||||
result_fields = [
|
||||
'allow_embed',
|
||||
|
@ -53,27 +84,21 @@ result_fields = [
|
|||
'thumbnail_360_url',
|
||||
'id',
|
||||
]
|
||||
search_url = (
|
||||
'https://api.dailymotion.com/videos?'
|
||||
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}'
|
||||
).format(
|
||||
fields=','.join(result_fields),
|
||||
password_protected='false',
|
||||
private='false',
|
||||
sort='relevance',
|
||||
limit=number_of_results,
|
||||
)
|
||||
"""`Fields selection`_, by default, a few fields are returned. To request more
|
||||
specific fields, the ``fields`` parameter is used with the list of fields
|
||||
SearXNG needs in the response to build a video result list.
|
||||
"""
|
||||
|
||||
search_url = 'https://api.dailymotion.com/videos?'
|
||||
"""URL to retrieve a list of videos.
|
||||
|
||||
- `REST GET`_
|
||||
- `Global API Parameters`_
|
||||
- `Video filters API`_
|
||||
"""
|
||||
|
||||
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
|
||||
|
||||
# The request query filters by 'languages' & 'country', therefore instead of
|
||||
# fetching only languages we need to fetch locales.
|
||||
supported_languages_url = 'https://api.dailymotion.com/locales'
|
||||
supported_languages_iso639: Set[str] = set()
|
||||
|
||||
|
||||
def init(_engine_settings):
    """Derive the set of bare language codes from ``supported_languages``.

    Each entry of ``supported_languages`` is cut at its first ``_`` and the
    leading part is stored in the module-level ``supported_languages_iso639``
    set.  The ``_engine_settings`` argument is not used.
    """
    global supported_languages_iso639
    supported_languages_iso639 = {locale_code.split('_')[0] for locale_code in supported_languages}
|
||||
"""URL template to embed video in SearXNG's result list."""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
@ -81,34 +106,42 @@ def request(query, params):
|
|||
if not query:
|
||||
return False
|
||||
|
||||
language = params['language']
|
||||
if language == 'all':
|
||||
language = 'en-US'
|
||||
locale = babel.Locale.parse(language, sep='-')
|
||||
eng_region = traits.get_region(params['searxng_locale'], 'en_US')
|
||||
eng_lang = traits.get_language(params['searxng_locale'], 'en')
|
||||
|
||||
language_iso639 = locale.language
|
||||
if locale.language not in supported_languages_iso639:
|
||||
language_iso639 = 'en'
|
||||
|
||||
query_args = {
|
||||
args = {
|
||||
'search': query,
|
||||
'languages': language_iso639,
|
||||
'family_filter': family_filter_map.get(params['safesearch'], 'false'),
|
||||
'thumbnail_ratio': 'original', # original|widescreen|square
|
||||
# https://developers.dailymotion.com/api/#video-filters
|
||||
'languages': eng_lang,
|
||||
'page': params['pageno'],
|
||||
'password_protected': 'false',
|
||||
'private': 'false',
|
||||
'sort': 'relevance',
|
||||
'limit': number_of_results,
|
||||
'fields': ','.join(result_fields),
|
||||
}
|
||||
|
||||
if locale.territory:
|
||||
localization = locale.language + '_' + locale.territory
|
||||
if localization in supported_languages:
|
||||
query_args['country'] = locale.territory
|
||||
args.update(safesearch_params.get(params['safesearch'], {}))
|
||||
|
||||
# Don't add localization and country arguments if the user selects only a
|
||||
# language (:de, :en, ..)
|
||||
|
||||
if len(params['searxng_locale'].split('-')) > 1:
|
||||
# https://developers.dailymotion.com/api/#global-parameters
|
||||
args['localization'] = eng_region
|
||||
args['country'] = eng_region.split('_')[1]
|
||||
# Insufficient rights for the `ams_country' parameter of route `GET /videos'
|
||||
# 'ams_country': eng_region.split('_')[1],
|
||||
|
||||
time_delta = time_delta_dict.get(params["time_range"])
|
||||
if time_delta:
|
||||
created_after = datetime.now() - time_delta
|
||||
query_args['created_after'] = datetime.timestamp(created_after)
|
||||
args['created_after'] = datetime.timestamp(created_after)
|
||||
|
||||
query_str = urlencode(query_args)
|
||||
params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '')
|
||||
params['raise_for_httperror'] = False
|
||||
query_str = urlencode(args)
|
||||
params['url'] = search_url + query_str
|
||||
|
||||
return params
|
||||
|
||||
|
@ -123,7 +156,7 @@ def response(resp):
|
|||
if 'error' in search_res:
|
||||
raise SearxEngineAPIException(search_res['error'].get('message'))
|
||||
|
||||
raise_for_httperror(resp)
|
||||
network.raise_for_httperror(resp)
|
||||
|
||||
# parse results
|
||||
for res in search_res.get('list', []):
|
||||
|
@ -167,7 +200,53 @@ def response(resp):
|
|||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def _fetch_supported_languages(resp):
    """Return the ``locale`` value of every item in the response's ``list``.

    ``resp`` must offer a ``json()`` method returning a mapping with a
    ``list`` key whose items each carry a ``locale`` key.
    """
    locale_items = resp.json()['list']
    return [entry['locale'] for entry in locale_items]
|
||||
def fetch_traits(engine_traits: EngineTraits):
    """Fetch locales & languages from dailymotion.

    Locales are fetched from `api/locales <https://api.dailymotion.com/locales>`_.
    Some locale codes returned by Dailymotion are duplicates and are ignored::

      en_EN --> en_GB, en_US
      ar_AA --> ar_EG, ar_AE, ar_SA

    The language list `api/languages <https://api.dailymotion.com/languages>`_
    contains over 7000 *languages* codes (see PR1071_).  Only those language
    codes that are used in the locales are taken over.

    Results are written to ``engine_traits.regions`` and
    ``engine_traits.languages``.  If one of the two requests does not return
    an OK response, an error is printed and the function returns without
    trying to parse that response; entries added before that point are kept.

    .. _PR1071: https://github.com/searxng/searxng/pull/1071

    """

    resp = network.get('https://api.dailymotion.com/locales')
    if not resp.ok:
        print("ERROR: response from dailymotion/locales is not OK.")
        # do not try to parse a failed response
        return

    for item in resp.json()['list']:
        eng_tag = item['locale']
        if eng_tag in ('en_EN', 'ar_AA'):
            continue
        try:
            sxng_tag = region_tag(babel.Locale.parse(eng_tag))
        except babel.UnknownLocaleError:
            print("ERROR: item unknown --> %s" % item)
            continue

        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            # first mapping wins; only report it when the engine tags differ
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.regions[sxng_tag] = eng_tag

    # a set keeps the membership test cheap for every entry of the languages list
    locale_lang_list = {x.split('_')[0] for x in engine_traits.regions.values()}

    resp = network.get('https://api.dailymotion.com/languages')
    if not resp.ok:
        print("ERROR: response from dailymotion/languages is not OK.")
        # keep the regions collected above, skip the languages
        return

    for item in resp.json()['list']:
        eng_tag = item['code']
        if eng_tag in locale_lang_list:
            sxng_tag = language_tag(babel.Locale.parse(eng_tag))
            engine_traits.languages[sxng_tag] = eng_tag
|
||||
|
|
|
@ -63,7 +63,7 @@ def search(query, request_params):
|
|||
for row in result_list:
|
||||
entry = {
|
||||
'query': query,
|
||||
'language': request_params['language'],
|
||||
'language': request_params['searxng_locale'],
|
||||
'value': row.get("value"),
|
||||
# choose a result template or comment out to use the *default*
|
||||
'template': 'key-value.html',
|
||||
|
|
|
@ -1,71 +1,207 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""DuckDuckGo Lite
|
||||
"""
|
||||
DuckDuckGo Lite
|
||||
~~~~~~~~~~~~~~~
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
|
||||
from lxml.html import fromstring
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urlencode
|
||||
import json
|
||||
import babel
|
||||
import lxml.html
|
||||
|
||||
from searx import (
|
||||
network,
|
||||
locales,
|
||||
redislib,
|
||||
)
|
||||
from searx import redisdb
|
||||
from searx.utils import (
|
||||
dict_subset,
|
||||
eval_xpath,
|
||||
eval_xpath_getindex,
|
||||
extract_text,
|
||||
match_language,
|
||||
)
|
||||
from searx.network import get
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://lite.duckduckgo.com/lite/',
|
||||
"wikidata_id": 'Q12805',
|
||||
"official_api_documentation": 'https://duckduckgo.com/api',
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
"""DuckDuckGo-Lite tries to guess user's preferred language from the HTTP
|
||||
``Accept-Language``. Optionally the user can select a region filter (but not a
|
||||
language).
|
||||
"""
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general', 'web']
|
||||
paging = True
|
||||
supported_languages_url = 'https://duckduckgo.com/util/u588.js'
|
||||
time_range_support = True
|
||||
send_accept_language_header = True
|
||||
safesearch = True # user can't select but the results are filtered
|
||||
|
||||
language_aliases = {
|
||||
'ar-SA': 'ar-XA',
|
||||
'es-419': 'es-XL',
|
||||
'ja': 'jp-JP',
|
||||
'ko': 'kr-KR',
|
||||
'sl-SI': 'sl-SL',
|
||||
'zh-TW': 'tzh-TW',
|
||||
'zh-HK': 'tzh-HK',
|
||||
}
|
||||
url = 'https://lite.duckduckgo.com/lite/'
|
||||
# url_ping = 'https://duckduckgo.com/t/sl_l'
|
||||
|
||||
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
|
||||
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
|
||||
|
||||
# search-url
|
||||
url = 'https://lite.duckduckgo.com/lite/'
|
||||
url_ping = 'https://duckduckgo.com/t/sl_l'
|
||||
|
||||
# match query's language to a region code that duckduckgo will accept
|
||||
def get_region_code(lang, lang_list=None):
|
||||
if lang == 'all':
|
||||
return None
|
||||
def cache_vqd(query, value):
|
||||
"""Caches a ``vqd`` value from a query.
|
||||
|
||||
lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT')
|
||||
lang_parts = lang_code.split('-')
|
||||
The vqd value depends on the query string and is needed for the follow up
|
||||
pages or the images loaded by a XMLHttpRequest:
|
||||
|
||||
# country code goes first
|
||||
return lang_parts[1].lower() + '-' + lang_parts[0].lower()
|
||||
- DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...`
|
||||
- DuckDuckGo Images: `https://duckduckgo.com/i.js??q=...&vqd=...`
|
||||
|
||||
"""
|
||||
c = redisdb.client()
|
||||
if c:
|
||||
logger.debug("cache vqd value: %s", value)
|
||||
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
|
||||
c.set(key, value, ex=600)
|
||||
|
||||
|
||||
def get_vqd(query, headers):
    """Return the ``vqd`` value that fits to the *query*.

    If ``redisdb.client()`` returns a client and a value is cached for the query
    (see :py:obj:`cache_vqd`), the cached value is re-used.  Otherwise the query
    is sent to DDG, the value is cut out of the ``vqd='...'`` fragment of the
    response text and handed to :py:obj:`cache_vqd`.

    :param query: the query string the ``vqd`` value belongs to
    :param headers: HTTP headers passed to the request that is sent to DDG
    :returns: the ``vqd`` value (string)
    :raises SearxEngineAPIException: if the response text does not contain a
        complete ``vqd='...'`` fragment

    """
    c = redisdb.client()
    if c:
        key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
        value = c.get(key)
        if value:
            # the cached value is stored as bytes
            value = value.decode('utf-8')
            logger.debug("re-use cached vqd value: %s", value)
            return value

    query_url = 'https://duckduckgo.com/?{query}&iar=images'.format(query=urlencode({'q': query}))
    res = network.get(query_url, headers=headers)
    content = res.text

    marker = 'vqd=\''
    start = content.find(marker)
    if start == -1:
        raise SearxEngineAPIException('Request failed')
    start += len(marker)

    end = content.find('\'', start)
    if end == -1:
        # Without the closing quote a slice up to -1 would silently drop the
        # last character and a broken value would be cached.
        raise SearxEngineAPIException('Request failed')

    value = content[start:end]
    logger.debug("new vqd value: %s", value)
    cache_vqd(query, value)
    return value
|
||||
|
||||
|
||||
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
    """Get DuckDuckGo's language identifier from SearXNG's locale.

    DuckDuckGo defines its languages by region codes (see
    :py:obj:`fetch_traits`).  The value is looked up in
    ``eng_traits.custom['lang_region']`` first; if the locale is not listed
    there, the result of ``eng_traits.get_language(sxng_locale, default)`` is
    returned.

    To get region and language of a DDG service use:

    .. code:: python

       eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
       eng_lang = get_ddg_lang(traits, params['searxng_locale'])

    It might be confusing, but the ``l`` value of the cookie is what SearXNG
    calls the *region*:

    .. code:: python

        # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
        params['cookies']['ad'] = eng_lang
        params['cookies']['ah'] = eng_region
        params['cookies']['l'] = eng_region

    .. hint::

       `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
       selection to the user, only a region can be selected by the user
       (``eng_region`` from the example above).  DDG-lite stores the selected
       region in a cookie::

         params['cookies']['kl'] = eng_region  # 'ar-es'

    """
    # region specific languages take precedence over the plain language traits
    region_languages = eng_traits.custom['lang_region']
    fallback = eng_traits.get_language(sxng_locale, default)
    return region_languages.get(sxng_locale, fallback)
|
||||
|
||||
|
||||
ddg_reg_map = {
|
||||
'tw-tzh': 'zh_TW',
|
||||
'hk-tzh': 'zh_HK',
|
||||
'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
|
||||
'es-ca': 'ca_ES',
|
||||
'id-en': 'id_ID',
|
||||
'no-no': 'nb_NO',
|
||||
'jp-jp': 'ja_JP',
|
||||
'kr-kr': 'ko_KR',
|
||||
'xa-ar': 'ar_SA',
|
||||
'sl-sl': 'sl_SI',
|
||||
'th-en': 'th_TH',
|
||||
'vn-en': 'vi_VN',
|
||||
}
|
||||
|
||||
ddg_lang_map = {
|
||||
# use ar --> ar_EG (Egypt's arabic)
|
||||
"ar_DZ": 'lang_region',
|
||||
"ar_JO": 'lang_region',
|
||||
"ar_SA": 'lang_region',
|
||||
# use bn --> bn_BD
|
||||
'bn_IN': 'lang_region',
|
||||
# use de --> de_DE
|
||||
'de_CH': 'lang_region',
|
||||
# use en --> en_US,
|
||||
'en_AU': 'lang_region',
|
||||
'en_CA': 'lang_region',
|
||||
'en_GB': 'lang_region',
|
||||
# Esperanto
|
||||
'eo_XX': 'eo',
|
||||
# use es --> es_ES,
|
||||
'es_AR': 'lang_region',
|
||||
'es_CL': 'lang_region',
|
||||
'es_CO': 'lang_region',
|
||||
'es_CR': 'lang_region',
|
||||
'es_EC': 'lang_region',
|
||||
'es_MX': 'lang_region',
|
||||
'es_PE': 'lang_region',
|
||||
'es_UY': 'lang_region',
|
||||
'es_VE': 'lang_region',
|
||||
# use fr --> fr_FR
|
||||
'fr_CA': 'lang_region',
|
||||
'fr_CH': 'lang_region',
|
||||
'fr_BE': 'lang_region',
|
||||
# use nl --> nl_NL
|
||||
'nl_BE': 'lang_region',
|
||||
# use pt --> pt_PT
|
||||
'pt_BR': 'lang_region',
|
||||
# skip these languages
|
||||
'od_IN': 'skip',
|
||||
'io_XX': 'skip',
|
||||
'tokipona_XX': 'skip',
|
||||
}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
|
||||
# eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
||||
|
||||
params['url'] = url
|
||||
params['method'] = 'POST'
|
||||
|
||||
params['data']['q'] = query
|
||||
|
||||
# The API is not documented, so we do some reverse engineering and emulate
|
||||
|
@ -88,23 +224,19 @@ def request(query, params):
|
|||
params['data']['s'] = offset
|
||||
params['data']['dc'] = offset + 1
|
||||
|
||||
# request needs a vqd argument
|
||||
params['data']['vqd'] = get_vqd(query, params["headers"])
|
||||
|
||||
# initial page does not have additional data in the input form
|
||||
if params['pageno'] > 1:
|
||||
# request the second page (and more pages) needs 'o' and 'api' arguments
|
||||
params['data']['o'] = 'json'
|
||||
params['data']['api'] = 'd.js'
|
||||
|
||||
# initial page does not have additional data in the input form
|
||||
if params['pageno'] > 2:
|
||||
# request the third page (and more pages) some more arguments
|
||||
params['data']['nextParams'] = ''
|
||||
params['data']['v'] = ''
|
||||
params['data']['vqd'] = ''
|
||||
params['data']['o'] = form_data.get('o', 'json')
|
||||
params['data']['api'] = form_data.get('api', 'd.js')
|
||||
params['data']['nextParams'] = form_data.get('nextParams', '')
|
||||
params['data']['v'] = form_data.get('v', 'l')
|
||||
|
||||
region_code = get_region_code(params['language'], supported_languages)
|
||||
if region_code:
|
||||
params['data']['kl'] = region_code
|
||||
params['cookies']['kl'] = region_code
|
||||
params['data']['kl'] = eng_region
|
||||
params['cookies']['kl'] = eng_region
|
||||
|
||||
params['data']['df'] = ''
|
||||
if params['time_range'] in time_range_dict:
|
||||
|
@ -116,26 +248,40 @@ def request(query, params):
|
|||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
|
||||
headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
|
||||
get(url_ping, headers=headers_ping)
|
||||
|
||||
if resp.status_code == 303:
|
||||
return []
|
||||
|
||||
results = []
|
||||
doc = fromstring(resp.text)
|
||||
doc = lxml.html.fromstring(resp.text)
|
||||
|
||||
result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
|
||||
if not len(result_table) >= 3:
|
||||
|
||||
if len(result_table) == 2:
|
||||
# some locales (at least China) do not have a "next page" button and
|
||||
# the layout of the HTML tables is different.
|
||||
result_table = result_table[1]
|
||||
elif not len(result_table) >= 3:
|
||||
# no more results
|
||||
return []
|
||||
result_table = result_table[2]
|
||||
else:
|
||||
result_table = result_table[2]
|
||||
# update form data from response
|
||||
form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
|
||||
if len(form):
|
||||
|
||||
form = form[0]
|
||||
form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
|
||||
form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
|
||||
form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
|
||||
logger.debug('form_data: %s', form_data)
|
||||
|
||||
value = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
|
||||
query = resp.search_params['data']['q']
|
||||
cache_vqd(query, value)
|
||||
|
||||
tr_rows = eval_xpath(result_table, './/tr')
|
||||
|
||||
# In the last <tr> is the form of the 'previous/next page' links
|
||||
tr_rows = tr_rows[:-1]
|
||||
|
||||
|
@ -172,15 +318,105 @@ def response(resp):
|
|||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def _fetch_supported_languages(resp):
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch languages & regions from DuckDuckGo.
|
||||
|
||||
# response is a js file with regions as an embedded object
|
||||
response_page = resp.text
|
||||
response_page = response_page[response_page.find('regions:{') + 8 :]
|
||||
response_page = response_page[: response_page.find('}') + 1]
|
||||
SearXNG's ``all`` locale maps DuckDuckGo's "All regions" (``wt-wt``).
|
||||
DuckDuckGo's language "Browsers preferred language" (``wt_WT``) makes no
|
||||
sense in a SearXNG request since SearXNG's ``all`` will not add a
|
||||
``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
|
||||
is ``wt-wt`` (the region).
|
||||
|
||||
regions_json = loads(response_page)
|
||||
supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
|
||||
Besides regions, DuckDuckGo also defines its languages by region codes. For
|
||||
example these are the english languages in DuckDuckGo:
|
||||
|
||||
return list(supported_languages)
|
||||
- en_US
|
||||
- en_AU
|
||||
- en_CA
|
||||
- en_GB
|
||||
|
||||
The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
|
||||
SearXNG's locale.
|
||||
|
||||
"""
|
||||
# pylint: disable=too-many-branches, too-many-statements
|
||||
# fetch regions
|
||||
|
||||
engine_traits.all_locale = 'wt-wt'
|
||||
|
||||
# updated from u588 to u661 / should be updated automatically?
|
||||
resp = network.get('https://duckduckgo.com/util/u661.js')
|
||||
|
||||
if not resp.ok:
|
||||
print("ERROR: response from DuckDuckGo is not OK.")
|
||||
|
||||
pos = resp.text.find('regions:{') + 8
|
||||
js_code = resp.text[pos:]
|
||||
pos = js_code.find('}') + 1
|
||||
regions = json.loads(js_code[:pos])
|
||||
|
||||
for eng_tag, name in regions.items():
|
||||
|
||||
if eng_tag == 'wt-wt':
|
||||
engine_traits.all_locale = 'wt-wt'
|
||||
continue
|
||||
|
||||
region = ddg_reg_map.get(eng_tag)
|
||||
if region == 'skip':
|
||||
continue
|
||||
|
||||
if not region:
|
||||
eng_territory, eng_lang = eng_tag.split('-')
|
||||
region = eng_lang + '_' + eng_territory.upper()
|
||||
|
||||
try:
|
||||
sxng_tag = locales.region_tag(babel.Locale.parse(region))
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
|
||||
continue
|
||||
|
||||
conflict = engine_traits.regions.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_tag:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
|
||||
continue
|
||||
engine_traits.regions[sxng_tag] = eng_tag
|
||||
|
||||
# fetch languages
|
||||
|
||||
engine_traits.custom['lang_region'] = {}
|
||||
|
||||
pos = resp.text.find('languages:{') + 10
|
||||
js_code = resp.text[pos:]
|
||||
pos = js_code.find('}') + 1
|
||||
js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
|
||||
languages = json.loads(js_code)
|
||||
|
||||
for eng_lang, name in languages.items():
|
||||
|
||||
if eng_lang == 'wt_WT':
|
||||
continue
|
||||
|
||||
babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
|
||||
if babel_tag == 'skip':
|
||||
continue
|
||||
|
||||
try:
|
||||
|
||||
if babel_tag == 'lang_region':
|
||||
sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
|
||||
engine_traits.custom['lang_region'][sxng_tag] = eng_lang
|
||||
continue
|
||||
|
||||
sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
|
||||
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
|
||||
continue
|
||||
|
||||
conflict = engine_traits.languages.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_lang:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
|
||||
continue
|
||||
engine_traits.languages[sxng_tag] = eng_lang
|
||||
|
|
|
@ -1,22 +1,33 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""DuckDuckGo (Instant Answer API)
|
||||
"""
|
||||
DuckDuckGo Instant Answer API
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from
|
||||
reverse engineering we can see that some services (e.g. instant answers) are still
|
||||
in use from the DDG search engine.
|
||||
|
||||
As far as we can say the *instant answers* API does not support languages, or at
|
||||
least we could not find out how language support should work. It seems that
|
||||
most of the features are based on English terms.
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from urllib.parse import urlencode, urlparse, urljoin
|
||||
from lxml import html
|
||||
|
||||
from searx.data import WIKIDATA_UNITS
|
||||
from searx.engines.duckduckgo import language_aliases
|
||||
from searx.engines.duckduckgo import ( # pylint: disable=unused-import
|
||||
_fetch_supported_languages,
|
||||
supported_languages_url,
|
||||
)
|
||||
from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
|
||||
from searx.utils import extract_text, html_to_text, get_string_replaces_function
|
||||
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://duckduckgo.com/',
|
||||
|
@ -37,7 +48,7 @@ replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
|
|||
|
||||
|
||||
def is_broken_text(text):
|
||||
"""duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>"
|
||||
"""duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
|
||||
|
||||
The href URL is broken, the "Related website" may contains some HTML.
|
||||
|
||||
|
@ -62,8 +73,6 @@ def result_to_text(text, htmlResult):
|
|||
|
||||
def request(query, params):
|
||||
params['url'] = URL.format(query=urlencode({'q': query}))
|
||||
language = match_language(params['language'], supported_languages, language_aliases)
|
||||
language = language.split('-')[0]
|
||||
return params
|
||||
|
||||
|
||||
|
@ -71,7 +80,7 @@ def response(resp):
|
|||
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
|
||||
results = []
|
||||
|
||||
search_res = json.loads(resp.text)
|
||||
search_res = resp.json()
|
||||
|
||||
# search_res.get('Entity') possible values (not exhaustive) :
|
||||
# * continent / country / department / location / waterfall
|
||||
|
@ -235,7 +244,7 @@ def unit_to_str(unit):
|
|||
|
||||
|
||||
def area_to_str(area):
|
||||
"""parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}"""
|
||||
"""parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
|
||||
unit = unit_to_str(area.get('unit'))
|
||||
if unit is not None:
|
||||
try:
|
||||
|
|
|
@ -1,26 +1,30 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
DuckDuckGo (Images)
|
||||
DuckDuckGo Images
|
||||
~~~~~~~~~~~~~~~~~
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urlencode
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.engines.duckduckgo import get_region_code
|
||||
from searx.engines.duckduckgo import ( # pylint: disable=unused-import
|
||||
_fetch_supported_languages,
|
||||
supported_languages_url,
|
||||
|
||||
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.duckduckgo import (
|
||||
get_ddg_lang,
|
||||
get_vqd,
|
||||
)
|
||||
from searx.network import get
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://duckduckgo.com/',
|
||||
"wikidata_id": 'Q12805',
|
||||
"official_api_documentation": {
|
||||
'url': 'https://duckduckgo.com/api',
|
||||
'comment': 'but images are not supported',
|
||||
},
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON (site requires js to get images)',
|
||||
|
@ -32,70 +36,64 @@ paging = True
|
|||
safesearch = True
|
||||
send_accept_language_header = True
|
||||
|
||||
# search-url
|
||||
images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'
|
||||
site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'
|
||||
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
|
||||
safesearch_args = {0: '1', 1: None, 2: '1'}
|
||||
|
||||
|
||||
# run query in site to get vqd number needed for requesting images
|
||||
# TODO: find a way to get this number without an extra request (is it a hash of the query?)
|
||||
def get_vqd(query, headers):
|
||||
query_url = site_url.format(query=urlencode({'q': query}))
|
||||
res = get(query_url, headers=headers)
|
||||
content = res.text
|
||||
if content.find('vqd=\'') == -1:
|
||||
raise SearxEngineAPIException('Request failed')
|
||||
vqd = content[content.find('vqd=\'') + 5 :]
|
||||
vqd = vqd[: vqd.find('\'')]
|
||||
return vqd
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
# to avoid running actual external requests when testing
|
||||
if 'is_test' not in params:
|
||||
vqd = get_vqd(query, params['headers'])
|
||||
else:
|
||||
vqd = '12345'
|
||||
|
||||
offset = (params['pageno'] - 1) * 50
|
||||
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
|
||||
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
||||
|
||||
safesearch = params['safesearch'] - 1
|
||||
args = {
|
||||
'q': query,
|
||||
'o': 'json',
|
||||
# 'u': 'bing',
|
||||
'l': eng_region,
|
||||
'vqd': get_vqd(query, params["headers"]),
|
||||
}
|
||||
|
||||
region_code = get_region_code(params['language'], lang_list=supported_languages)
|
||||
if region_code:
|
||||
params['url'] = images_url.format(
|
||||
query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd
|
||||
)
|
||||
else:
|
||||
params['url'] = images_url.format(query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd)
|
||||
if params['pageno'] > 1:
|
||||
args['s'] = (params['pageno'] - 1) * 100
|
||||
|
||||
params['cookies']['ad'] = eng_lang # zh_CN
|
||||
params['cookies']['ah'] = eng_region # "us-en,de-de"
|
||||
params['cookies']['l'] = eng_region # "hk-tzh"
|
||||
logger.debug("cookies: %s", params['cookies'])
|
||||
|
||||
safe_search = safesearch_cookies.get(params['safesearch'])
|
||||
if safe_search is not None:
|
||||
params['cookies']['p'] = safe_search # "-2", "1"
|
||||
safe_search = safesearch_args.get(params['safesearch'])
|
||||
if safe_search is not None:
|
||||
args['p'] = safe_search # "-1", "1"
|
||||
|
||||
args = urlencode(args)
|
||||
params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,')
|
||||
|
||||
params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01'
|
||||
params['headers']['Referer'] = 'https://duckduckgo.com/'
|
||||
params['headers']['X-Requested-With'] = 'XMLHttpRequest'
|
||||
logger.debug("headers: %s", params['headers'])
|
||||
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
results = []
|
||||
res_json = resp.json()
|
||||
|
||||
content = resp.text
|
||||
res_json = loads(content)
|
||||
|
||||
# parse results
|
||||
for result in res_json['results']:
|
||||
title = result['title']
|
||||
url = result['url']
|
||||
thumbnail = result['thumbnail']
|
||||
image = result['image']
|
||||
|
||||
# append result
|
||||
results.append(
|
||||
{
|
||||
'template': 'images.html',
|
||||
'title': title,
|
||||
'title': result['title'],
|
||||
'content': '',
|
||||
'thumbnail_src': thumbnail,
|
||||
'img_src': image,
|
||||
'url': url,
|
||||
'thumbnail_src': result['thumbnail'],
|
||||
'img_src': result['image'],
|
||||
'url': result['url'],
|
||||
'img_format': '%s x %s' % (result['width'], result['height']),
|
||||
'source': result['source'],
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
@ -1,13 +1,29 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""DuckDuckGo Weather"""
|
||||
"""
|
||||
DuckDuckGo Weather
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from json import loads
|
||||
from urllib.parse import quote
|
||||
|
||||
from datetime import datetime
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.duckduckgo import get_ddg_lang
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
about = {
|
||||
"website": 'https://duckduckgo.com/',
|
||||
"wikidata_id": 'Q12805',
|
||||
|
@ -17,9 +33,11 @@ about = {
|
|||
"results": "JSON",
|
||||
}
|
||||
|
||||
categories = ["others"]
|
||||
send_accept_language_header = True
|
||||
|
||||
url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
|
||||
# engine dependent config
|
||||
categories = ["others"]
|
||||
URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
|
||||
|
||||
|
||||
def generate_condition_table(condition):
|
||||
|
@ -72,8 +90,17 @@ def generate_day_table(day):
|
|||
|
||||
|
||||
def request(query, params):
|
||||
params["url"] = url.format(query=quote(query), lang=params['language'].split('-')[0])
|
||||
|
||||
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
|
||||
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
||||
|
||||
# !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
|
||||
params['cookies']['ad'] = eng_lang
|
||||
params['cookies']['ah'] = eng_region
|
||||
params['cookies']['l'] = eng_region
|
||||
logger.debug("cookies: %s", params['cookies'])
|
||||
|
||||
params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0])
|
||||
return params
|
||||
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ base_url = 'https://wiki.gentoo.org'
|
|||
# xpath queries
|
||||
xpath_results = '//ul[@class="mw-search-results"]/li'
|
||||
xpath_link = './/div[@class="mw-search-result-heading"]/a'
|
||||
xpath_content = './/div[@class="searchresult"]'
|
||||
|
||||
|
||||
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
|
||||
|
@ -77,8 +78,6 @@ main_langs = {
|
|||
'uk': 'Українська',
|
||||
'zh': '简体中文',
|
||||
}
|
||||
supported_languages = dict(lang_urls, **main_langs)
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
|
@ -118,7 +117,8 @@ def response(resp):
|
|||
link = result.xpath(xpath_link)[0]
|
||||
href = urljoin(base_url, link.attrib.get('href'))
|
||||
title = extract_text(link)
|
||||
content = extract_text(result.xpath(xpath_content))
|
||||
|
||||
results.append({'url': href, 'title': title})
|
||||
results.append({'url': href, 'title': title, 'content': content})
|
||||
|
||||
return results
|
||||
|
|
|
@ -1,34 +1,39 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""This is the implementation of the google WEB engine. Some of this
|
||||
implementations are shared by other engines:
|
||||
"""This is the implementation of the Google WEB engine. Some of this
|
||||
implementations (mainly the :py:obj:`get_google_info`) are shared by other
|
||||
engines:
|
||||
|
||||
- :ref:`google images engine`
|
||||
- :ref:`google news engine`
|
||||
- :ref:`google videos engine`
|
||||
|
||||
The google WEB engine itself has a special setup option:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: google
|
||||
...
|
||||
use_mobile_ui: false
|
||||
|
||||
``use_mobile_ui``: (default: ``false``)
|
||||
Enables to use *mobile endpoint* to bypass the google blocking (see
|
||||
:issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More
|
||||
results` is not affected by Google rate limiting and we can still do requests
|
||||
while actively blocked by the original Google search. By activate
|
||||
``use_mobile_ui`` this behavior is simulated by adding the parameter
|
||||
``async=use_ac:true,_fmt:pc`` to the :py:func:`request`.
|
||||
- :ref:`google scholar engine`
|
||||
- :ref:`google autocomplete`
|
||||
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import re
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
|
||||
import babel
|
||||
import babel.core
|
||||
import babel.languages
|
||||
|
||||
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
|
||||
from searx.locales import language_tag, region_tag, get_offical_locales
|
||||
from searx import network
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -45,64 +50,6 @@ categories = ['general', 'web']
|
|||
paging = True
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
send_accept_language_header = True
|
||||
use_mobile_ui = False
|
||||
supported_languages_url = 'https://www.google.com/preferences?#languages'
|
||||
|
||||
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
|
||||
google_domains = {
|
||||
'BG': 'google.bg', # Bulgaria
|
||||
'CZ': 'google.cz', # Czech Republic
|
||||
'DE': 'google.de', # Germany
|
||||
'DK': 'google.dk', # Denmark
|
||||
'AT': 'google.at', # Austria
|
||||
'CH': 'google.ch', # Switzerland
|
||||
'GR': 'google.gr', # Greece
|
||||
'AU': 'google.com.au', # Australia
|
||||
'CA': 'google.ca', # Canada
|
||||
'GB': 'google.co.uk', # United Kingdom
|
||||
'ID': 'google.co.id', # Indonesia
|
||||
'IE': 'google.ie', # Ireland
|
||||
'IN': 'google.co.in', # India
|
||||
'MY': 'google.com.my', # Malaysia
|
||||
'NZ': 'google.co.nz', # New Zealand
|
||||
'PH': 'google.com.ph', # Philippines
|
||||
'SG': 'google.com.sg', # Singapore
|
||||
'US': 'google.com', # United States (google.us) redirects to .com
|
||||
'ZA': 'google.co.za', # South Africa
|
||||
'AR': 'google.com.ar', # Argentina
|
||||
'CL': 'google.cl', # Chile
|
||||
'ES': 'google.es', # Spain
|
||||
'MX': 'google.com.mx', # Mexico
|
||||
'EE': 'google.ee', # Estonia
|
||||
'FI': 'google.fi', # Finland
|
||||
'BE': 'google.be', # Belgium
|
||||
'FR': 'google.fr', # France
|
||||
'IL': 'google.co.il', # Israel
|
||||
'HR': 'google.hr', # Croatia
|
||||
'HU': 'google.hu', # Hungary
|
||||
'IT': 'google.it', # Italy
|
||||
'JP': 'google.co.jp', # Japan
|
||||
'KR': 'google.co.kr', # South Korea
|
||||
'LT': 'google.lt', # Lithuania
|
||||
'LV': 'google.lv', # Latvia
|
||||
'NO': 'google.no', # Norway
|
||||
'NL': 'google.nl', # Netherlands
|
||||
'PL': 'google.pl', # Poland
|
||||
'BR': 'google.com.br', # Brazil
|
||||
'PT': 'google.pt', # Portugal
|
||||
'RO': 'google.ro', # Romania
|
||||
'RU': 'google.ru', # Russia
|
||||
'SK': 'google.sk', # Slovakia
|
||||
'SI': 'google.si', # Slovenia
|
||||
'SE': 'google.se', # Sweden
|
||||
'TH': 'google.co.th', # Thailand
|
||||
'TR': 'google.com.tr', # Turkey
|
||||
'UA': 'google.com.ua', # Ukraine
|
||||
'CN': 'google.com.hk', # There is no google.cn, we use .com.hk for zh-CN
|
||||
'HK': 'google.com.hk', # Hong Kong
|
||||
'TW': 'google.com.tw', # Taiwan
|
||||
}
|
||||
|
||||
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
|
||||
|
||||
|
@ -112,50 +59,50 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
|
|||
# specific xpath variables
|
||||
# ------------------------
|
||||
|
||||
results_xpath = './/div[@data-sokoban-container]'
|
||||
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
|
||||
title_xpath = './/a/h3[1]'
|
||||
href_xpath = './/a[h3]/@href'
|
||||
content_xpath = './/div[@data-content-feature=1]'
|
||||
|
||||
# google *sections* are no usual *results*, we ignore them
|
||||
g_section_with_header = './g-section-with-header'
|
||||
|
||||
content_xpath = './/div[@data-sncf]'
|
||||
|
||||
# Suggestions are links placed in a *card-section*, we extract only the text
|
||||
# from the links not the links itself.
|
||||
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
|
||||
|
||||
# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for
|
||||
# # celebrities like '!google natasha allegri'
|
||||
# # or '!google chris evans'
|
||||
UI_ASYNC = 'use_ac:true,_fmt:prog'
|
||||
"""Format of the response from UI's async request."""
|
||||
|
||||
def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
|
||||
"""Composing various language properties for the google engines.
|
||||
|
||||
def get_google_info(params, eng_traits):
|
||||
"""Composing various (language) properties for the google engines (:ref:`google
|
||||
API`).
|
||||
|
||||
This function is called by the various google engines (:ref:`google web
|
||||
engine`, :ref:`google images engine`, :ref:`google news engine` and
|
||||
:ref:`google videos engine`).
|
||||
|
||||
:param dict param: request parameters of the engine
|
||||
:param dict param: Request parameters of the engine. At least
|
||||
a ``searxng_locale`` key should be in the dictionary.
|
||||
|
||||
:param list lang_list: list of supported languages of the engine
|
||||
:py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`
|
||||
|
||||
:param dict lang_list: custom aliases for non standard language codes
|
||||
(used when calling :py:func:`searx.utils.match_language`)
|
||||
|
||||
:param bool supported_any_language: When a language is not specified, the
|
||||
language interpretation is left up to Google to decide how the search
|
||||
results should be delivered. This argument is ``True`` for the google
|
||||
engine and ``False`` for the other engines (google-images, -news,
|
||||
-scholar, -videos).
|
||||
:param eng_traits: Engine's traits fetched from google preferences
|
||||
(:py:obj:`searx.enginelib.traits.EngineTraits`)
|
||||
|
||||
:rtype: dict
|
||||
:returns:
|
||||
Py-Dictionary with the key/value pairs:
|
||||
|
||||
language:
|
||||
Return value from :py:func:`searx.utils.match_language`
|
||||
The language code that is used by google (e.g. ``lang_en`` or
|
||||
``lang_zh-TW``)
|
||||
|
||||
country:
|
||||
The country code (e.g. US, AT, CA, FR, DE ..)
|
||||
The country code that is used by google (e.g. ``US`` or ``TW``)
|
||||
|
||||
locale:
|
||||
A instance of :py:obj:`babel.core.Locale` build from the
|
||||
``searxng_locale`` value.
|
||||
|
||||
subdomain:
|
||||
Google subdomain :py:obj:`google_domains` that fits to the country
|
||||
|
@ -165,52 +112,67 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
|
|||
Py-Dictionary with additional request arguments (can be passed to
|
||||
:py:func:`urllib.parse.urlencode`).
|
||||
|
||||
- ``hl`` parameter: specifies the interface language of user interface.
|
||||
- ``lr`` parameter: restricts search results to documents written in
|
||||
a particular language.
|
||||
- ``cr`` parameter: restricts search results to documents
|
||||
originating in a particular country.
|
||||
- ``ie`` parameter: sets the character encoding scheme that should
|
||||
be used to interpret the query string ('utf8').
|
||||
- ``oe`` parameter: sets the character encoding scheme that should
|
||||
be used to decode the XML result ('utf8').
|
||||
|
||||
headers:
|
||||
Py-Dictionary with additional HTTP headers (can be passed to
|
||||
request's headers)
|
||||
|
||||
- ``Accept: '*/*'``
|
||||
|
||||
"""
|
||||
|
||||
ret_val = {
|
||||
'language': None,
|
||||
'country': None,
|
||||
'subdomain': None,
|
||||
'params': {},
|
||||
'headers': {},
|
||||
'cookies': {},
|
||||
'locale': None,
|
||||
}
|
||||
|
||||
# language ...
|
||||
sxng_locale = params.get('searxng_locale', 'all')
|
||||
try:
|
||||
locale = babel.Locale.parse(sxng_locale, sep='-')
|
||||
except babel.core.UnknownLocaleError:
|
||||
locale = None
|
||||
|
||||
_lang = params['language']
|
||||
_any_language = _lang.lower() == 'all'
|
||||
if _any_language:
|
||||
_lang = 'en-US'
|
||||
language = match_language(_lang, lang_list, custom_aliases)
|
||||
ret_val['language'] = language
|
||||
eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
|
||||
lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
|
||||
country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
|
||||
|
||||
# country ...
|
||||
# Test zh_hans & zh_hant --> in the topmost links of the result lists of
|
||||
# TW and HK you should find a wiktionary.org zh_hant link. In the result
|
||||
# list of zh-CN there should be no hant link; instead you should find
|
||||
# zh.m.wikipedia.org/zh somewhere in the top.
|
||||
|
||||
_l = _lang.split('-')
|
||||
if len(_l) == 2:
|
||||
country = _l[1]
|
||||
else:
|
||||
country = _l[0].upper()
|
||||
if country == 'EN':
|
||||
country = 'US'
|
||||
# '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
|
||||
# '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
|
||||
|
||||
ret_val['language'] = eng_lang
|
||||
ret_val['country'] = country
|
||||
|
||||
# subdomain ...
|
||||
|
||||
ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
|
||||
|
||||
# params & headers
|
||||
|
||||
lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..)
|
||||
ret_val['locale'] = locale
|
||||
ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
|
||||
|
||||
# hl parameter:
|
||||
# https://developers.google.com/custom-search/docs/xml_results#hlsp The
|
||||
# Interface Language:
|
||||
# The hl parameter specifies the interface language (host language) of
|
||||
# your user interface. To improve the performance and the quality of your
|
||||
# search results, you are strongly encouraged to set this parameter
|
||||
# explicitly.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#hlsp
|
||||
# The Interface Language:
|
||||
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
|
||||
|
||||
ret_val['params']['hl'] = lang_list.get(lang_country, language)
|
||||
ret_val['params']['hl'] = lang_code
|
||||
|
||||
# lr parameter:
|
||||
# The lr (language restrict) parameter restricts search results to
|
||||
|
@ -218,22 +180,72 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
|
|||
# https://developers.google.com/custom-search/docs/xml_results#lrsp
|
||||
# Language Collection Values:
|
||||
# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
|
||||
#
|
||||
# To select 'all' languages an empty 'lr' value is used.
|
||||
#
|
||||
# Different to other google services, Google Scholar supports to select more
|
||||
# than one language. The languages are separated by a pipe '|' (logical OR).
|
||||
# By example: &lr=lang_zh-TW%7Clang_de selects articles written in
|
||||
# traditional chinese OR german language.
|
||||
|
||||
if _any_language and supported_any_language:
|
||||
ret_val['params']['lr'] = eng_lang
|
||||
if sxng_locale == 'all':
|
||||
ret_val['params']['lr'] = ''
|
||||
|
||||
# interpretation is left up to Google (based on whoogle)
|
||||
#
|
||||
# - add parameter ``source=lnt``
|
||||
# - don't use parameter ``lr``
|
||||
# - don't add a ``Accept-Language`` HTTP header.
|
||||
# cr parameter:
|
||||
# The cr parameter restricts search results to documents originating in a
|
||||
# particular country.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#crsp
|
||||
|
||||
ret_val['params']['source'] = 'lnt'
|
||||
ret_val['params']['cr'] = 'country' + country
|
||||
if sxng_locale == 'all':
|
||||
ret_val['params']['cr'] = ''
|
||||
|
||||
else:
|
||||
# gl parameter: (mandatory by Google News)
|
||||
# The gl parameter value is a two-letter country code. For WebSearch
|
||||
# results, the gl parameter boosts search results whose country of origin
|
||||
# matches the parameter value. See the Country Codes section for a list of
|
||||
# valid values.
|
||||
# Specifying a gl parameter value in WebSearch requests should improve the
|
||||
# relevance of results. This is particularly true for international
|
||||
# customers and, even more specifically, for customers in English-speaking
|
||||
# countries other than the United States.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#glsp
|
||||
|
||||
# restricts search results to documents written in a particular
|
||||
# language.
|
||||
ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
|
||||
ret_val['params']['gl'] = country
|
||||
|
||||
# ie parameter:
|
||||
# The ie parameter sets the character encoding scheme that should be used
|
||||
# to interpret the query string. The default ie value is latin1.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#iesp
|
||||
|
||||
ret_val['params']['ie'] = 'utf8'
|
||||
|
||||
# oe parameter:
|
||||
# The oe parameter sets the character encoding scheme that should be used
|
||||
# to decode the XML result. The default oe value is latin1.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#oesp
|
||||
|
||||
ret_val['params']['oe'] = 'utf8'
|
||||
|
||||
# num parameter:
|
||||
# The num parameter identifies the number of search results to return.
|
||||
# The default num value is 10, and the maximum value is 20. If you request
|
||||
# more than 20 results, only 20 results will be returned.
|
||||
# https://developers.google.com/custom-search/docs/xml_results#numsp
|
||||
|
||||
# HINT: seems to have no effect (tested in google WEB & Images)
|
||||
# ret_val['params']['num'] = 20
|
||||
|
||||
# HTTP headers
|
||||
|
||||
ret_val['headers']['Accept'] = '*/*'
|
||||
|
||||
# Cookies
|
||||
|
||||
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
|
||||
# - https://github.com/searxng/searxng/issues/1555
|
||||
ret_val['cookies']['CONSENT'] = "YES+"
|
||||
|
||||
return ret_val
|
||||
|
||||
|
@ -245,33 +257,34 @@ def detect_google_sorry(resp):
|
|||
|
||||
def request(query, params):
|
||||
"""Google search request"""
|
||||
|
||||
# pylint: disable=line-too-long
|
||||
offset = (params['pageno'] - 1) * 10
|
||||
|
||||
lang_info = get_lang_info(params, supported_languages, language_aliases, True)
|
||||
|
||||
additional_parameters = {}
|
||||
if use_mobile_ui:
|
||||
additional_parameters = {
|
||||
'asearch': 'arc',
|
||||
'async': 'use_ac:true,_fmt:prog',
|
||||
}
|
||||
google_info = get_google_info(params, traits)
|
||||
|
||||
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
|
||||
query_url = (
|
||||
'https://'
|
||||
+ lang_info['subdomain']
|
||||
+ google_info['subdomain']
|
||||
+ '/search'
|
||||
+ "?"
|
||||
+ urlencode(
|
||||
{
|
||||
'q': query,
|
||||
**lang_info['params'],
|
||||
'ie': "utf8",
|
||||
'oe': "utf8",
|
||||
'start': offset,
|
||||
**google_info['params'],
|
||||
'filter': '0',
|
||||
**additional_parameters,
|
||||
'start': offset,
|
||||
# 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
|
||||
# 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
|
||||
# 'cs' : 1,
|
||||
# 'sa': 'N',
|
||||
# 'yv': 3,
|
||||
# 'prmd': 'vin',
|
||||
# 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
|
||||
# 'sa': 'N',
|
||||
# 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
|
||||
# formerly known as use_mobile_ui
|
||||
'asearch': 'arc',
|
||||
'async': UI_ASYNC,
|
||||
}
|
||||
)
|
||||
)
|
||||
|
@ -282,25 +295,38 @@ def request(query, params):
|
|||
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
|
||||
params['url'] = query_url
|
||||
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
if use_mobile_ui:
|
||||
params['headers']['Accept'] = '*/*'
|
||||
else:
|
||||
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
|
||||
|
||||
|
||||
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;
|
||||
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
|
||||
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
|
||||
|
||||
|
||||
def _parse_data_images(dom):
|
||||
data_image_map = {}
|
||||
for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
|
||||
end_pos = data_image.rfind('=')
|
||||
if end_pos > 0:
|
||||
data_image = data_image[: end_pos + 1]
|
||||
data_image_map[img_id] = data_image
|
||||
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
|
||||
return data_image_map
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Get response from google's search request"""
|
||||
|
||||
# pylint: disable=too-many-branches, too-many-statements
|
||||
detect_google_sorry(resp)
|
||||
|
||||
results = []
|
||||
|
||||
# convert the text to dom
|
||||
dom = html.fromstring(resp.text)
|
||||
data_image_map = _parse_data_images(dom)
|
||||
|
||||
# results --> answer
|
||||
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
|
||||
if answer_list:
|
||||
|
@ -309,25 +335,9 @@ def response(resp):
|
|||
else:
|
||||
logger.debug("did not find 'answer'")
|
||||
|
||||
# results --> number_of_results
|
||||
if not use_mobile_ui:
|
||||
try:
|
||||
_txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
|
||||
_digit = ''.join([n for n in _txt if n.isdigit()])
|
||||
number_of_results = int(_digit)
|
||||
results.append({'number_of_results': number_of_results})
|
||||
except Exception as e: # pylint: disable=broad-except
|
||||
logger.debug("did not 'number_of_results'")
|
||||
logger.error(e, exc_info=True)
|
||||
|
||||
# parse results
|
||||
|
||||
for result in eval_xpath_list(dom, results_xpath):
|
||||
|
||||
# google *sections*
|
||||
if extract_text(eval_xpath(result, g_section_with_header)):
|
||||
logger.debug("ignoring <g-section-with-header>")
|
||||
continue
|
||||
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
|
||||
|
||||
try:
|
||||
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
|
||||
|
@ -336,16 +346,30 @@ def response(resp):
|
|||
logger.debug('ignoring item from the result_xpath list: missing title')
|
||||
continue
|
||||
title = extract_text(title_tag)
|
||||
|
||||
url = eval_xpath_getindex(result, href_xpath, 0, None)
|
||||
if url is None:
|
||||
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
|
||||
continue
|
||||
content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
|
||||
if content is None:
|
||||
|
||||
content_nodes = eval_xpath(result, content_xpath)
|
||||
content = extract_text(content_nodes)
|
||||
|
||||
if not content:
|
||||
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
|
||||
continue
|
||||
|
||||
logger.debug('add link to results: %s', title)
|
||||
results.append({'url': url, 'title': title, 'content': content})
|
||||
img_src = content_nodes[0].xpath('.//img/@src')
|
||||
if img_src:
|
||||
img_src = img_src[0]
|
||||
if img_src.startswith('data:image'):
|
||||
img_id = content_nodes[0].xpath('.//img/@id')
|
||||
if img_id:
|
||||
img_src = data_image_map.get(img_id[0])
|
||||
else:
|
||||
img_src = None
|
||||
|
||||
results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
|
||||
|
||||
except Exception as e: # pylint: disable=broad-except
|
||||
logger.error(e, exc_info=True)
|
||||
|
@ -361,15 +385,107 @@ def response(resp):
|
|||
|
||||
|
||||
# get supported languages from their site
|
||||
def _fetch_supported_languages(resp):
|
||||
ret_val = {}
|
||||
|
||||
|
||||
skip_countries = [
|
||||
# official language of google-country not in google-languages
|
||||
'AL', # Albanien (sq)
|
||||
'AZ', # Aserbaidschan (az)
|
||||
'BD', # Bangladesch (bn)
|
||||
'BN', # Brunei Darussalam (ms)
|
||||
'BT', # Bhutan (dz)
|
||||
'ET', # Äthiopien (am)
|
||||
'GE', # Georgien (ka, os)
|
||||
'GL', # Grönland (kl)
|
||||
'KH', # Kambodscha (km)
|
||||
'LA', # Laos (lo)
|
||||
'LK', # Sri Lanka (si, ta)
|
||||
'ME', # Montenegro (sr)
|
||||
'MK', # Nordmazedonien (mk, sq)
|
||||
'MM', # Myanmar (my)
|
||||
'MN', # Mongolei (mn)
|
||||
'MV', # Malediven (dv) // dv_MV is unknown by babel
|
||||
'MY', # Malaysia (ms)
|
||||
'NP', # Nepal (ne)
|
||||
'TJ', # Tadschikistan (tg)
|
||||
'TM', # Turkmenistan (tk)
|
||||
'UZ', # Usbekistan (uz)
|
||||
]
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
|
||||
"""Fetch languages from Google."""
|
||||
# pylint: disable=import-outside-toplevel, too-many-branches
|
||||
|
||||
engine_traits.custom['supported_domains'] = {}
|
||||
|
||||
resp = network.get('https://www.google.com/preferences')
|
||||
if not resp.ok:
|
||||
raise RuntimeError("Response from Google's preferences is not OK.")
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]')
|
||||
# supported language codes
|
||||
|
||||
for x in radio_buttons:
|
||||
name = x.get("data-name")
|
||||
code = x.get("value").split('_')[-1]
|
||||
ret_val[code] = {"name": name}
|
||||
lang_map = {'no': 'nb'}
|
||||
for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
|
||||
|
||||
return ret_val
|
||||
eng_lang = x.get("value").split('_')[-1]
|
||||
try:
|
||||
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
|
||||
continue
|
||||
sxng_lang = language_tag(locale)
|
||||
|
||||
conflict = engine_traits.languages.get(sxng_lang)
|
||||
if conflict:
|
||||
if conflict != eng_lang:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
|
||||
continue
|
||||
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
|
||||
|
||||
# alias languages
|
||||
engine_traits.languages['zh'] = 'lang_zh-CN'
|
||||
|
||||
# supported region codes
|
||||
|
||||
for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
|
||||
eng_country = x.get("value")
|
||||
|
||||
if eng_country in skip_countries:
|
||||
continue
|
||||
if eng_country == 'ZZ':
|
||||
engine_traits.all_locale = 'ZZ'
|
||||
continue
|
||||
|
||||
sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True)
|
||||
|
||||
if not sxng_locales:
|
||||
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
|
||||
continue
|
||||
|
||||
for sxng_locale in sxng_locales:
|
||||
engine_traits.regions[region_tag(sxng_locale)] = eng_country
|
||||
|
||||
# alias regions
|
||||
engine_traits.regions['zh-CN'] = 'HK'
|
||||
|
||||
# supported domains
|
||||
|
||||
if add_domains:
|
||||
resp = network.get('https://www.google.com/supported_domains')
|
||||
if not resp.ok:
|
||||
raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
|
||||
|
||||
for domain in resp.text.split():
|
||||
domain = domain.strip()
|
||||
if not domain or domain in [
|
||||
'.google.com',
|
||||
]:
|
||||
continue
|
||||
region = domain.split('.')[-1].upper()
|
||||
engine_traits.custom['supported_domains'][region] = 'www' + domain
|
||||
if region == 'HK':
|
||||
# There is no google.cn, we use .com.hk for zh-CN
|
||||
engine_traits.custom['supported_domains']['CN'] = 'www' + domain
|
||||
|
|
|
@ -1,31 +1,38 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""This is the implementation of the google images engine using the google
|
||||
internal API used the Google Go Android app.
|
||||
"""This is the implementation of the Google Images engine using the internal
|
||||
Google API used by the Google Go Android app.
|
||||
|
||||
This internal API offers results in
|
||||
|
||||
- JSON (_fmt:json)
|
||||
- Protobuf (_fmt:pb)
|
||||
- Protobuf compressed? (_fmt:pc)
|
||||
- HTML (_fmt:html)
|
||||
- Protobuf encoded in JSON (_fmt:jspb).
|
||||
- JSON (``_fmt:json``)
|
||||
- Protobuf_ (``_fmt:pb``)
|
||||
- Protobuf_ compressed? (``_fmt:pc``)
|
||||
- HTML (``_fmt:html``)
|
||||
- Protobuf_ encoded in JSON (``_fmt:jspb``).
|
||||
|
||||
.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from json import loads
|
||||
|
||||
from searx.engines.google import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
get_lang_info,
|
||||
get_google_info,
|
||||
time_range_dict,
|
||||
detect_google_sorry,
|
||||
)
|
||||
|
||||
# pylint: disable=unused-import
|
||||
from searx.engines.google import supported_languages_url, _fetch_supported_languages
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
logger: logging.Logger
|
||||
traits: EngineTraits
|
||||
|
||||
# pylint: enable=unused-import
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -40,7 +47,6 @@ about = {
|
|||
# engine dependent config
|
||||
categories = ['images', 'web']
|
||||
paging = True
|
||||
use_locale_domain = True
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
send_accept_language_header = True
|
||||
|
@ -51,20 +57,18 @@ filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
|
|||
def request(query, params):
|
||||
"""Google-Image search request"""
|
||||
|
||||
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
|
||||
google_info = get_google_info(params, traits)
|
||||
|
||||
query_url = (
|
||||
'https://'
|
||||
+ lang_info['subdomain']
|
||||
+ google_info['subdomain']
|
||||
+ '/search'
|
||||
+ "?"
|
||||
+ urlencode(
|
||||
{
|
||||
'q': query,
|
||||
'tbm': "isch",
|
||||
**lang_info['params'],
|
||||
'ie': "utf8",
|
||||
'oe': "utf8",
|
||||
**google_info['params'],
|
||||
'asearch': 'isch',
|
||||
'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
|
||||
}
|
||||
|
@ -77,9 +81,8 @@ def request(query, params):
|
|||
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
|
||||
params['url'] = query_url
|
||||
|
||||
params['headers'].update(lang_info['headers'])
|
||||
params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
|
||||
params['headers']['Accept'] = '*/*'
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
|
||||
|
||||
|
||||
|
@ -111,7 +114,11 @@ def response(resp):
|
|||
|
||||
copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
|
||||
if copyright_notice:
|
||||
result_item['source'] += ' / ' + copyright_notice
|
||||
result_item['source'] += ' | ' + copyright_notice
|
||||
|
||||
freshness_date = item["result"].get("freshness_date")
|
||||
if freshness_date:
|
||||
result_item['source'] += ' | ' + freshness_date
|
||||
|
||||
file_size = item.get('gsa', {}).get('file_size')
|
||||
if file_size:
|
||||
|
|
|
@ -1,24 +1,40 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""This is the implementation of the google news engine. The google news API
|
||||
ignores some parameters from the common :ref:`google API`:
|
||||
"""This is the implementation of the Google News engine.
|
||||
|
||||
- num_ : the number of search results is ignored
|
||||
Google News has a different region handling compared to Google WEB.
|
||||
|
||||
- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
|
||||
- the hl_ argument has to be set correctly (and different to Google WEB)
|
||||
- the gl_ argument is mandatory
|
||||
|
||||
If one of these arguments is not set correctly, the request is redirected to
|
||||
CONSENT dialog::
|
||||
|
||||
https://consent.google.com/m?continue=
|
||||
|
||||
The google news API ignores some parameters from the common :ref:`google API`:
|
||||
|
||||
- num_ : the number of search results is ignored / there is no paging all
|
||||
results for a query term are in the first response.
|
||||
- save_ : is ignored / Google-News results are always *SafeSearch*
|
||||
|
||||
.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
|
||||
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
|
||||
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
|
||||
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import binascii
|
||||
import re
|
||||
from urllib.parse import urlencode
|
||||
from base64 import b64decode
|
||||
from lxml import html
|
||||
import babel
|
||||
|
||||
from searx import locales
|
||||
from searx.utils import (
|
||||
eval_xpath,
|
||||
eval_xpath_list,
|
||||
|
@ -26,18 +42,19 @@ from searx.utils import (
|
|||
extract_text,
|
||||
)
|
||||
|
||||
# pylint: disable=unused-import
|
||||
from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
supported_languages_url,
|
||||
_fetch_supported_languages,
|
||||
)
|
||||
|
||||
# pylint: enable=unused-import
|
||||
|
||||
from searx.engines.google import (
|
||||
get_lang_info,
|
||||
get_google_info,
|
||||
detect_google_sorry,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -49,70 +66,77 @@ about = {
|
|||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# compared to other google engines google-news has a different time range
|
||||
# support. The time range is included in the search term.
|
||||
time_range_dict = {
|
||||
'day': 'when:1d',
|
||||
'week': 'when:7d',
|
||||
'month': 'when:1m',
|
||||
'year': 'when:1y',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
|
||||
categories = ['news']
|
||||
paging = False
|
||||
use_locale_domain = True
|
||||
time_range_support = True
|
||||
time_range_support = False
|
||||
|
||||
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
|
||||
# False here, otherwise checker will report safesearch-errors::
|
||||
#
|
||||
# safesearch : results are identitical for safesearch=0 and safesearch=2
|
||||
safesearch = False
|
||||
send_accept_language_header = True
|
||||
safesearch = True
|
||||
# send_accept_language_header = True
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google-News search request"""
|
||||
|
||||
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
|
||||
sxng_locale = params.get('searxng_locale', 'en-US')
|
||||
ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
|
||||
google_info = get_google_info(params, traits)
|
||||
google_info['subdomain'] = 'news.google.com' # google news has only one domain
|
||||
|
||||
# google news has only one domain
|
||||
lang_info['subdomain'] = 'news.google.com'
|
||||
ceid_region, ceid_lang = ceid.split(':')
|
||||
ceid_lang, ceid_suffix = (
|
||||
ceid_lang.split('-')
|
||||
+ [
|
||||
None,
|
||||
]
|
||||
)[:2]
|
||||
|
||||
ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
|
||||
google_info['params']['hl'] = ceid_lang
|
||||
|
||||
# google news redirects en to en-US
|
||||
if lang_info['params']['hl'] == 'en':
|
||||
lang_info['params']['hl'] = 'en-US'
|
||||
if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
|
||||
|
||||
# Very special to google-news compared to other google engines, the time
|
||||
# range is included in the search term.
|
||||
if params['time_range']:
|
||||
query += ' ' + time_range_dict[params['time_range']]
|
||||
if ceid_region.lower() == ceid_lang:
|
||||
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
|
||||
else:
|
||||
google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
|
||||
|
||||
elif ceid_region.lower() != ceid_lang:
|
||||
|
||||
if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
|
||||
google_info['params']['hl'] = ceid_lang
|
||||
else:
|
||||
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
|
||||
|
||||
google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
|
||||
google_info['params']['gl'] = ceid_region
|
||||
|
||||
query_url = (
|
||||
'https://'
|
||||
+ lang_info['subdomain']
|
||||
+ '/search'
|
||||
+ "?"
|
||||
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']})
|
||||
+ google_info['subdomain']
|
||||
+ "/search?"
|
||||
+ urlencode(
|
||||
{
|
||||
'q': query,
|
||||
**google_info['params'],
|
||||
}
|
||||
)
|
||||
# ceid includes a ':' character which must not be urlencoded
|
||||
+ ('&ceid=%s' % ceid)
|
||||
) # ceid includes a ':' character which must not be urlencoded
|
||||
)
|
||||
|
||||
params['url'] = query_url
|
||||
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Get response from google's search request"""
|
||||
results = []
|
||||
|
||||
detect_google_sorry(resp)
|
||||
|
||||
# convert the text to dom
|
||||
|
@ -152,8 +176,8 @@ def response(resp):
|
|||
|
||||
# The pub_date is mostly a string like 'yesterday', not a real
|
||||
# timezone date or time. Therefore we can't use publishedDate.
|
||||
pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time'))
|
||||
pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a'))
|
||||
pub_date = extract_text(eval_xpath(result, './article//time'))
|
||||
pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
|
||||
|
||||
content = ' / '.join([x for x in [pub_origin, pub_date] if x])
|
||||
|
||||
|
@ -174,3 +198,127 @@ def response(resp):
|
|||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
ceid_list = [
|
||||
'AE:ar',
|
||||
'AR:es-419',
|
||||
'AT:de',
|
||||
'AU:en',
|
||||
'BD:bn',
|
||||
'BE:fr',
|
||||
'BE:nl',
|
||||
'BG:bg',
|
||||
'BR:pt-419',
|
||||
'BW:en',
|
||||
'CA:en',
|
||||
'CA:fr',
|
||||
'CH:de',
|
||||
'CH:fr',
|
||||
'CL:es-419',
|
||||
'CN:zh-Hans',
|
||||
'CO:es-419',
|
||||
'CU:es-419',
|
||||
'CZ:cs',
|
||||
'DE:de',
|
||||
'EG:ar',
|
||||
'ES:es',
|
||||
'ET:en',
|
||||
'FR:fr',
|
||||
'GB:en',
|
||||
'GH:en',
|
||||
'GR:el',
|
||||
'HK:zh-Hant',
|
||||
'HU:hu',
|
||||
'ID:en',
|
||||
'ID:id',
|
||||
'IE:en',
|
||||
'IL:en',
|
||||
'IL:he',
|
||||
'IN:bn',
|
||||
'IN:en',
|
||||
'IN:hi',
|
||||
'IN:ml',
|
||||
'IN:mr',
|
||||
'IN:ta',
|
||||
'IN:te',
|
||||
'IT:it',
|
||||
'JP:ja',
|
||||
'KE:en',
|
||||
'KR:ko',
|
||||
'LB:ar',
|
||||
'LT:lt',
|
||||
'LV:en',
|
||||
'LV:lv',
|
||||
'MA:fr',
|
||||
'MX:es-419',
|
||||
'MY:en',
|
||||
'NA:en',
|
||||
'NG:en',
|
||||
'NL:nl',
|
||||
'NO:no',
|
||||
'NZ:en',
|
||||
'PE:es-419',
|
||||
'PH:en',
|
||||
'PK:en',
|
||||
'PL:pl',
|
||||
'PT:pt-150',
|
||||
'RO:ro',
|
||||
'RS:sr',
|
||||
'RU:ru',
|
||||
'SA:ar',
|
||||
'SE:sv',
|
||||
'SG:en',
|
||||
'SI:sl',
|
||||
'SK:sk',
|
||||
'SN:fr',
|
||||
'TH:th',
|
||||
'TR:tr',
|
||||
'TW:zh-Hant',
|
||||
'TZ:en',
|
||||
'UA:ru',
|
||||
'UA:uk',
|
||||
'UG:en',
|
||||
'US:en',
|
||||
'US:es-419',
|
||||
'VE:es-419',
|
||||
'VN:vi',
|
||||
'ZA:en',
|
||||
'ZW:en',
|
||||
]
|
||||
"""List of region/language combinations supported by Google News. Values of the
|
||||
``ceid`` argument of the Google News REST API."""
|
||||
|
||||
|
||||
_skip_values = [
|
||||
'ET:en', # english (ethiopia)
|
||||
'ID:en', # english (indonesia)
|
||||
'LV:en', # english (latvia)
|
||||
]
|
||||
|
||||
_ceid_locale_map = {'NO:no': 'nb-NO'}
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
_fetch_traits(engine_traits, add_domains=False)
|
||||
|
||||
engine_traits.custom['ceid'] = {}
|
||||
|
||||
for ceid in ceid_list:
|
||||
if ceid in _skip_values:
|
||||
continue
|
||||
|
||||
region, lang = ceid.split(':')
|
||||
x = lang.split('-')
|
||||
if len(x) > 1:
|
||||
if x[1] not in ['Hant', 'Hans']:
|
||||
lang = x[0]
|
||||
|
||||
sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
|
||||
try:
|
||||
locale = babel.Locale.parse(sxng_locale, sep='-')
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
|
||||
continue
|
||||
|
||||
engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
|
||||
|
|
|
@ -1,19 +1,18 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Google (Scholar)
|
||||
"""This is the implementation of the Google Scholar engine.
|
||||
|
||||
For detailed description of the *REST-full* API see: `Query Parameter
|
||||
Definitions`_.
|
||||
|
||||
.. _Query Parameter Definitions:
|
||||
https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
|
||||
Compared to other Google services the Scholar engine has a simple GET REST-API
|
||||
and there does not exist an `async` API. Even though the API is slightly vintage we
|
||||
can make use of the :ref:`google API` to assemble the arguments of the GET
|
||||
request.
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Optional
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from lxml import html
|
||||
|
||||
from searx.utils import (
|
||||
|
@ -23,19 +22,21 @@ from searx.utils import (
|
|||
extract_text,
|
||||
)
|
||||
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
|
||||
from searx.engines.google import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
get_lang_info,
|
||||
get_google_info,
|
||||
time_range_dict,
|
||||
detect_google_sorry,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
# pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
supported_languages_url,
|
||||
_fetch_supported_languages,
|
||||
)
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
# pylint: enable=unused-import
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -51,53 +52,62 @@ about = {
|
|||
categories = ['science', 'scientific publications']
|
||||
paging = True
|
||||
language_support = True
|
||||
use_locale_domain = True
|
||||
time_range_support = True
|
||||
safesearch = False
|
||||
send_accept_language_header = True
|
||||
|
||||
|
||||
def time_range_url(params):
|
||||
"""Returns a URL query component for a google-Scholar time range based on
|
||||
``params['time_range']``. Google-Scholar does only support ranges in years.
|
||||
To have any effect, all the Searx ranges (*day*, *week*, *month*, *year*)
|
||||
are mapped to *year*. If no range is set, an empty string is returned.
|
||||
Example::
|
||||
def time_range_args(params):
|
||||
"""Returns a dictionary with a time range arguments based on
|
||||
``params['time_range']``.
|
||||
|
||||
Google Scholar supports a detailed search by year. Searching by *last
|
||||
month* or *last week* (as offered by SearXNG) is uncommon for scientific
|
||||
publications and is not supported by Google Scholar.
|
||||
|
||||
To limit the result list when the user selects a range, all the SearXNG
|
||||
ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range
|
||||
is set an empty dictionary of arguments is returned. Example; when
|
||||
user selects a time range (current year minus one in 2022):
|
||||
|
||||
.. code:: python
|
||||
|
||||
{ 'as_ylo' : 2021 }
|
||||
|
||||
&as_ylo=2019
|
||||
"""
|
||||
# as_ylo=2016&as_yhi=2019
|
||||
ret_val = ''
|
||||
ret_val = {}
|
||||
if params['time_range'] in time_range_dict:
|
||||
ret_val = urlencode({'as_ylo': datetime.now().year - 1})
|
||||
return '&' + ret_val
|
||||
ret_val['as_ylo'] = datetime.now().year - 1
|
||||
return ret_val
|
||||
|
||||
|
||||
def detect_google_captcha(dom):
|
||||
"""In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is
|
||||
not redirected to ``sorry.google.com``.
|
||||
"""
|
||||
if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
|
||||
raise SearxEngineCaptchaException()
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google-Scholar search request"""
|
||||
|
||||
offset = (params['pageno'] - 1) * 10
|
||||
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
|
||||
|
||||
google_info = get_google_info(params, traits)
|
||||
# subdomain is: scholar.google.xy
|
||||
lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
|
||||
google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
|
||||
|
||||
query_url = (
|
||||
'https://'
|
||||
+ lang_info['subdomain']
|
||||
+ '/scholar'
|
||||
+ "?"
|
||||
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset})
|
||||
)
|
||||
args = {
|
||||
'q': query,
|
||||
**google_info['params'],
|
||||
'start': (params['pageno'] - 1) * 10,
|
||||
'as_sdt': '2007', # include patents / to disable set '0,5'
|
||||
'as_vis': '0', # include citations / to disable set '1'
|
||||
}
|
||||
args.update(time_range_args(params))
|
||||
|
||||
query_url += time_range_url(params)
|
||||
params['url'] = query_url
|
||||
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
|
||||
# params['google_subdomain'] = subdomain
|
||||
params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
|
||||
|
||||
|
||||
|
@ -138,19 +148,15 @@ def parse_gs_a(text: Optional[str]):
|
|||
|
||||
|
||||
def response(resp): # pylint: disable=too-many-locals
|
||||
"""Get response from google's search request"""
|
||||
"""Parse response from Google Scholar"""
|
||||
results = []
|
||||
|
||||
detect_google_sorry(resp)
|
||||
|
||||
# which subdomain ?
|
||||
# subdomain = resp.search_params.get('google_subdomain')
|
||||
|
||||
# convert the text to dom
|
||||
dom = html.fromstring(resp.text)
|
||||
detect_google_captcha(dom)
|
||||
|
||||
# parse results
|
||||
for result in eval_xpath_list(dom, '//div[@data-cid]'):
|
||||
for result in eval_xpath_list(dom, '//div[@data-rp]'):
|
||||
|
||||
title = extract_text(eval_xpath(result, './/h3[1]//a'))
|
||||
|
||||
|
@ -158,7 +164,7 @@ def response(resp): # pylint: disable=too-many-locals
|
|||
# this is a [ZITATION] block
|
||||
continue
|
||||
|
||||
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
|
||||
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
|
||||
if pub_type:
|
||||
pub_type = pub_type[1:-1].lower()
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""This is the implementation of the google videos engine.
|
||||
"""This is the implementation of the Google Videos engine.
|
||||
|
||||
.. admonition:: Content-Security-Policy (CSP)
|
||||
|
||||
|
@ -14,9 +14,8 @@
|
|||
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import re
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
|
||||
|
@ -27,20 +26,22 @@ from searx.utils import (
|
|||
extract_text,
|
||||
)
|
||||
|
||||
from searx.engines.google import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.google import (
|
||||
get_lang_info,
|
||||
get_google_info,
|
||||
time_range_dict,
|
||||
filter_mapping,
|
||||
g_section_with_header,
|
||||
title_xpath,
|
||||
suggestion_xpath,
|
||||
detect_google_sorry,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
# pylint: disable=unused-import
|
||||
from searx.engines.google import supported_languages_url, _fetch_supported_languages
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
# pylint: enable=unused-import
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -55,70 +56,32 @@ about = {
|
|||
# engine dependent config
|
||||
|
||||
categories = ['videos', 'web']
|
||||
paging = False
|
||||
paging = True
|
||||
language_support = True
|
||||
use_locale_domain = True
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
send_accept_language_header = True
|
||||
|
||||
RE_CACHE = {}
|
||||
|
||||
|
||||
def _re(regexpr):
|
||||
"""returns compiled regular expression"""
|
||||
RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
|
||||
return RE_CACHE[regexpr]
|
||||
|
||||
|
||||
def scrap_out_thumbs_src(dom):
|
||||
ret_val = {}
|
||||
thumb_name = 'dimg_'
|
||||
for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
|
||||
_script = script.text
|
||||
# "dimg_35":"https://i.ytimg.c....",
|
||||
_dimurl = _re("s='([^']*)").findall(_script)
|
||||
for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script):
|
||||
v = v.replace(r'\u003d', '=')
|
||||
v = v.replace(r'\u0026', '&')
|
||||
ret_val[k] = v
|
||||
logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
|
||||
return ret_val
|
||||
|
||||
|
||||
def scrap_out_thumbs(dom):
|
||||
"""Scrap out thumbnail data from <script> tags."""
|
||||
ret_val = {}
|
||||
thumb_name = 'dimg_'
|
||||
|
||||
for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
|
||||
_script = script.text
|
||||
|
||||
# var s='data:image/jpeg;base64, ...'
|
||||
_imgdata = _re("s='([^']*)").findall(_script)
|
||||
if not _imgdata:
|
||||
continue
|
||||
|
||||
# var ii=['dimg_17']
|
||||
for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
|
||||
# At least the equal sign in the URL needs to be decoded
|
||||
ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
|
||||
|
||||
logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
|
||||
return ret_val
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google-Video search request"""
|
||||
|
||||
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
|
||||
google_info = get_google_info(params, traits)
|
||||
|
||||
query_url = (
|
||||
'https://'
|
||||
+ lang_info['subdomain']
|
||||
+ google_info['subdomain']
|
||||
+ '/search'
|
||||
+ "?"
|
||||
+ urlencode({'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8"})
|
||||
+ urlencode(
|
||||
{
|
||||
'q': query,
|
||||
'tbm': "vid",
|
||||
'start': 10 * params['pageno'],
|
||||
**google_info['params'],
|
||||
'asearch': 'arc',
|
||||
'async': 'use_ac:true,_fmt:html',
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
if params['time_range'] in time_range_dict:
|
||||
|
@ -127,9 +90,8 @@ def request(query, params):
|
|||
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
|
||||
params['url'] = query_url
|
||||
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
params['cookies'] = google_info['cookies']
|
||||
params['headers'].update(google_info['headers'])
|
||||
return params
|
||||
|
||||
|
||||
|
@ -141,43 +103,30 @@ def response(resp):
|
|||
|
||||
# convert the text to dom
|
||||
dom = html.fromstring(resp.text)
|
||||
vidthumb_imgdata = scrap_out_thumbs(dom)
|
||||
thumbs_src = scrap_out_thumbs_src(dom)
|
||||
logger.debug(str(thumbs_src))
|
||||
|
||||
# parse results
|
||||
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
|
||||
|
||||
# ignore google *sections*
|
||||
if extract_text(eval_xpath(result, g_section_with_header)):
|
||||
logger.debug("ignoring <g-section-with-header>")
|
||||
img_src = eval_xpath_getindex(result, './/img/@src', 0, None)
|
||||
if img_src is None:
|
||||
continue
|
||||
|
||||
# ignore articles without an image id / e.g. news articles
|
||||
img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
|
||||
if img_id is None:
|
||||
logger.error("no img_id found in item %s (news article?)", len(results) + 1)
|
||||
continue
|
||||
title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
|
||||
url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
|
||||
|
||||
img_src = vidthumb_imgdata.get(img_id, None)
|
||||
if not img_src:
|
||||
img_src = thumbs_src.get(img_id, "")
|
||||
|
||||
title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
|
||||
url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
|
||||
length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span'))
|
||||
c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
|
||||
content = extract_text(c_node)
|
||||
pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
|
||||
pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]'))
|
||||
length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]'))
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'length': length,
|
||||
'author': pub_info,
|
||||
'thumbnail': img_src,
|
||||
'length': length,
|
||||
'template': 'videos.html',
|
||||
}
|
||||
)
|
||||
|
|
|
@ -1,18 +1,30 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
peertube (Videos)
|
||||
# lint: pylint
|
||||
"""Peertube and :py:obj:`SepiaSearch <searx.engines.sepiasearch>` do share
|
||||
(more or less) the same REST API and the schema of the JSON result is identical.
|
||||
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from datetime import datetime
|
||||
import re
|
||||
from urllib.parse import urlencode
|
||||
from searx.utils import html_to_text
|
||||
from datetime import datetime
|
||||
from dateutil.parser import parse
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
import babel
|
||||
|
||||
from searx import network
|
||||
from searx.locales import language_tag
|
||||
from searx.utils import html_to_text
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
# pylint: disable=line-too-long
|
||||
"website": 'https://joinpeertube.org',
|
||||
"wikidata_id": 'Q50938515',
|
||||
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html',
|
||||
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
|
@ -22,66 +34,155 @@ about = {
|
|||
categories = ["videos"]
|
||||
paging = True
|
||||
base_url = "https://peer.tube"
|
||||
supported_languages_url = 'https://peer.tube/api/v1/videos/languages'
|
||||
"""Base URL of the Peertube instance. A list of instances is available at:
|
||||
|
||||
- https://instances.joinpeertube.org/instances
|
||||
"""
|
||||
|
||||
time_range_support = True
|
||||
time_range_table = {
|
||||
'day': relativedelta(),
|
||||
'week': relativedelta(weeks=-1),
|
||||
'month': relativedelta(months=-1),
|
||||
'year': relativedelta(years=-1),
|
||||
}
|
||||
|
||||
safesearch = True
|
||||
safesearch_table = {0: 'both', 1: 'false', 2: 'false'}
|
||||
|
||||
|
||||
def minute_to_hm(minute):
|
||||
if isinstance(minute, int):
|
||||
return "%d:%02d" % (divmod(minute, 60))
|
||||
return None
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
sanitized_url = base_url.rstrip("/")
|
||||
pageno = (params["pageno"] - 1) * 15
|
||||
search_url = sanitized_url + "/api/v1/search/videos/?pageno={pageno}&{query}"
|
||||
query_dict = {"search": query}
|
||||
language = params["language"].split("-")[0]
|
||||
if "all" != language and language in supported_languages:
|
||||
query_dict["languageOneOf"] = language
|
||||
params["url"] = search_url.format(query=urlencode(query_dict), pageno=pageno)
|
||||
"""Assemble request for the Peertube API"""
|
||||
|
||||
if not query:
|
||||
return False
|
||||
|
||||
# eng_region = traits.get_region(params['searxng_locale'], 'en_US')
|
||||
eng_lang = traits.get_language(params['searxng_locale'], None)
|
||||
|
||||
params['url'] = (
|
||||
base_url.rstrip("/")
|
||||
+ "/api/v1/search/videos?"
|
||||
+ urlencode(
|
||||
{
|
||||
'search': query,
|
||||
'searchTarget': 'search-index', # Vidiversum
|
||||
'resultType': 'videos',
|
||||
'start': (params['pageno'] - 1) * 10,
|
||||
'count': 10,
|
||||
# -createdAt: sort by date ascending / createdAt: date descending
|
||||
'sort': '-match', # sort by *match descending*
|
||||
'nsfw': safesearch_table[params['safesearch']],
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
if eng_lang is not None:
|
||||
params['url'] += '&languageOneOf[]=' + eng_lang
|
||||
params['url'] += '&boostLanguages[]=' + eng_lang
|
||||
|
||||
if params['time_range'] in time_range_table:
|
||||
time = datetime.now().date() + time_range_table[params['time_range']]
|
||||
params['url'] += '&startDate=' + time.isoformat()
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def _get_offset_from_pageno(pageno):
|
||||
return (pageno - 1) * 15 + 1
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
sanitized_url = base_url.rstrip("/")
|
||||
return video_response(resp)
|
||||
|
||||
|
||||
def video_response(resp):
|
||||
"""Parse video response from SepiaSearch and Peertube instances."""
|
||||
results = []
|
||||
|
||||
search_res = loads(resp.text)
|
||||
json_data = resp.json()
|
||||
|
||||
# return empty array if there are no results
|
||||
if "data" not in search_res:
|
||||
if 'data' not in json_data:
|
||||
return []
|
||||
|
||||
# parse results
|
||||
for res in search_res["data"]:
|
||||
title = res["name"]
|
||||
url = sanitized_url + "/videos/watch/" + res["uuid"]
|
||||
description = res["description"]
|
||||
if description:
|
||||
content = html_to_text(res["description"])
|
||||
else:
|
||||
content = ""
|
||||
thumbnail = sanitized_url + res["thumbnailPath"]
|
||||
publishedDate = datetime.strptime(res["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
||||
for result in json_data['data']:
|
||||
metadata = [
|
||||
x
|
||||
for x in [
|
||||
result.get('channel', {}).get('displayName'),
|
||||
result.get('channel', {}).get('name') + '@' + result.get('channel', {}).get('host'),
|
||||
', '.join(result.get('tags', [])),
|
||||
]
|
||||
if x
|
||||
]
|
||||
|
||||
results.append(
|
||||
{
|
||||
"template": "videos.html",
|
||||
"url": url,
|
||||
"title": title,
|
||||
"content": content,
|
||||
"publishedDate": publishedDate,
|
||||
"iframe_src": sanitized_url + res["embedPath"],
|
||||
"thumbnail": thumbnail,
|
||||
'url': result['url'],
|
||||
'title': result['name'],
|
||||
'content': html_to_text(result.get('description') or ''),
|
||||
'author': result.get('account', {}).get('displayName'),
|
||||
'length': minute_to_hm(result.get('duration')),
|
||||
'template': 'videos.html',
|
||||
'publishedDate': parse(result['publishedAt']),
|
||||
'iframe_src': result.get('embedUrl'),
|
||||
'thumbnail': result.get('thumbnailUrl') or result.get('previewUrl'),
|
||||
'metadata': ' | '.join(metadata),
|
||||
}
|
||||
)
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
def _fetch_supported_languages(resp):
|
||||
videolanguages = resp.json()
|
||||
peertube_languages = list(videolanguages.keys())
|
||||
return peertube_languages
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch languages from peertube's search-index source code.
|
||||
|
||||
See videoLanguages_ in commit `8ed5c729 - Refactor and redesign client`_
|
||||
|
||||
.. _8ed5c729 - Refactor and redesign client:
|
||||
https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729
|
||||
.. _videoLanguages:
|
||||
https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291
|
||||
"""
|
||||
|
||||
resp = network.get(
|
||||
'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue',
|
||||
# the response from search-index repository is very slow
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
if not resp.ok:
|
||||
print("ERROR: response from peertube is not OK.")
|
||||
return
|
||||
|
||||
js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL)
|
||||
if not js_lang:
|
||||
print("ERROR: can't determine languages from peertube")
|
||||
return
|
||||
|
||||
for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)):
|
||||
try:
|
||||
eng_tag = lang.group(1)
|
||||
if eng_tag == 'oc':
|
||||
# Occitan is not known by babel, its closest relative is Catalan
|
||||
# but 'ca' is already in the list of engine_traits.languages -->
|
||||
# 'oc' will be ignored.
|
||||
continue
|
||||
|
||||
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
|
||||
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: %s is unknown by babel" % eng_tag)
|
||||
continue
|
||||
|
||||
conflict = engine_traits.languages.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_tag:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
|
||||
continue
|
||||
engine_traits.languages[sxng_tag] = eng_tag
|
||||
|
||||
engine_traits.languages['zh_Hans'] = 'zh'
|
||||
engine_traits.languages['zh_Hant'] = 'zh'
|
||||
|
|
|
@ -34,7 +34,9 @@ import babel
|
|||
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.network import raise_for_httperror
|
||||
from searx.locales import get_engine_locale
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -49,7 +51,6 @@ about = {
|
|||
# engine dependent config
|
||||
categories = []
|
||||
paging = True
|
||||
supported_languages_url = about['website']
|
||||
qwant_categ = None # web|news|inages|videos
|
||||
|
||||
safesearch = True
|
||||
|
@ -95,7 +96,7 @@ def request(query, params):
|
|||
)
|
||||
|
||||
# add qwant's locale
|
||||
q_locale = get_engine_locale(params['language'], supported_languages, default='en_US')
|
||||
q_locale = traits.get_region(params["searxng_locale"], default='en_US')
|
||||
params['url'] += '&locale=' + q_locale
|
||||
|
||||
# add safesearch option
|
||||
|
@ -243,15 +244,20 @@ def response(resp):
|
|||
return results
|
||||
|
||||
|
||||
def _fetch_supported_languages(resp):
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from searx import network
|
||||
from searx.locales import region_tag
|
||||
|
||||
resp = network.get(about['website'])
|
||||
text = resp.text
|
||||
text = text[text.find('INITIAL_PROPS') :]
|
||||
text = text[text.find('{') : text.find('</script>')]
|
||||
|
||||
q_initial_props = loads(text)
|
||||
q_locales = q_initial_props.get('locales')
|
||||
q_valid_locales = []
|
||||
eng_tag_list = set()
|
||||
|
||||
for country, v in q_locales.items():
|
||||
for lang in v['langs']:
|
||||
|
@ -261,25 +267,18 @@ def _fetch_supported_languages(resp):
|
|||
# qwant-news does not support all locales from qwant-web:
|
||||
continue
|
||||
|
||||
q_valid_locales.append(_locale)
|
||||
eng_tag_list.add(_locale)
|
||||
|
||||
supported_languages = {}
|
||||
|
||||
for q_locale in q_valid_locales:
|
||||
for eng_tag in eng_tag_list:
|
||||
try:
|
||||
locale = babel.Locale.parse(q_locale, sep='_')
|
||||
except babel.core.UnknownLocaleError:
|
||||
print("ERROR: can't determine babel locale of quant's locale %s" % q_locale)
|
||||
sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_'))
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag)
|
||||
continue
|
||||
|
||||
# note: supported_languages (dict)
|
||||
#
|
||||
# dict's key is a string build up from a babel.Locale object / the
|
||||
# notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and
|
||||
# language) notation and dict's values are the locale strings used by
|
||||
# the engine.
|
||||
|
||||
searxng_locale = locale.language + '-' + locale.territory # --> params['language']
|
||||
supported_languages[searxng_locale] = q_locale
|
||||
|
||||
return supported_languages
|
||||
conflict = engine_traits.regions.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_tag:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
|
||||
continue
|
||||
engine_traits.regions[sxng_tag] = eng_tag
|
||||
|
|
|
@ -1,70 +1,80 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
SepiaSearch (Videos)
|
||||
# lint: pylint
|
||||
"""SepiaSearch uses the same languages as :py:obj:`Peertube
|
||||
<searx.engines.peertube>` and the response is identical to the response from the
|
||||
peertube engines.
|
||||
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from dateutil import parser, relativedelta
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
|
||||
# about
|
||||
from searx.engines.peertube import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.peertube import (
|
||||
# pylint: disable=unused-import
|
||||
video_response,
|
||||
safesearch_table,
|
||||
time_range_table,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
about = {
|
||||
# pylint: disable=line-too-long
|
||||
"website": 'https://sepiasearch.org',
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": "https://framagit.org/framasoft/peertube/search-index/-/tree/master/server/controllers/api", # NOQA
|
||||
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['videos']
|
||||
paging = True
|
||||
|
||||
base_url = 'https://sepiasearch.org'
|
||||
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
supported_languages = [
|
||||
# fmt: off
|
||||
'en', 'fr', 'ja', 'eu', 'ca', 'cs', 'eo', 'el',
|
||||
'de', 'it', 'nl', 'es', 'oc', 'gd', 'zh', 'pt',
|
||||
'sv', 'pl', 'fi', 'ru'
|
||||
# fmt: on
|
||||
]
|
||||
base_url = 'https://sepiasearch.org/api/v1/search/videos'
|
||||
|
||||
safesearch_table = {0: 'both', 1: 'false', 2: 'false'}
|
||||
|
||||
time_range_table = {
|
||||
'day': relativedelta.relativedelta(),
|
||||
'week': relativedelta.relativedelta(weeks=-1),
|
||||
'month': relativedelta.relativedelta(months=-1),
|
||||
'year': relativedelta.relativedelta(years=-1),
|
||||
}
|
||||
|
||||
|
||||
def minute_to_hm(minute):
|
||||
if isinstance(minute, int):
|
||||
return "%d:%02d" % (divmod(minute, 60))
|
||||
return None
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Assemble request for the SepiaSearch API"""
|
||||
|
||||
if not query:
|
||||
return False
|
||||
|
||||
# eng_region = traits.get_region(params['searxng_locale'], 'en_US')
|
||||
eng_lang = traits.get_language(params['searxng_locale'], None)
|
||||
|
||||
params['url'] = (
|
||||
base_url
|
||||
+ '?'
|
||||
base_url.rstrip("/")
|
||||
+ "/api/v1/search/videos?"
|
||||
+ urlencode(
|
||||
{
|
||||
'search': query,
|
||||
'start': (params['pageno'] - 1) * 10,
|
||||
'count': 10,
|
||||
'sort': '-match',
|
||||
# -createdAt: sort by date ascending / createdAt: date descending
|
||||
'sort': '-match', # sort by *match descending*
|
||||
'nsfw': safesearch_table[params['safesearch']],
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
language = params['language'].split('-')[0]
|
||||
if language in supported_languages:
|
||||
params['url'] += '&languageOneOf[]=' + language
|
||||
if eng_lang is not None:
|
||||
params['url'] += '&languageOneOf[]=' + eng_lang
|
||||
params['url'] += '&boostLanguages[]=' + eng_lang
|
||||
|
||||
if params['time_range'] in time_range_table:
|
||||
time = datetime.now().date() + time_range_table[params['time_range']]
|
||||
params['url'] += '&startDate=' + time.isoformat()
|
||||
|
@ -73,34 +83,4 @@ def request(query, params):
|
|||
|
||||
|
||||
def response(resp):
|
||||
results = []
|
||||
|
||||
search_results = loads(resp.text)
|
||||
|
||||
if 'data' not in search_results:
|
||||
return []
|
||||
|
||||
for result in search_results['data']:
|
||||
title = result['name']
|
||||
content = result['description']
|
||||
thumbnail = result['thumbnailUrl']
|
||||
publishedDate = parser.parse(result['publishedAt'])
|
||||
author = result.get('account', {}).get('displayName')
|
||||
length = minute_to_hm(result.get('duration'))
|
||||
url = result['url']
|
||||
|
||||
results.append(
|
||||
{
|
||||
'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'author': author,
|
||||
'length': length,
|
||||
'template': 'videos.html',
|
||||
'publishedDate': publishedDate,
|
||||
'iframe_src': result.get('embedUrl'),
|
||||
'thumbnail': thumbnail,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
return video_response(resp)
|
||||
|
|
|
@ -1,28 +1,108 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Startpage (Web)
|
||||
"""Startpage's language & region selectors are a mess ..
|
||||
|
||||
.. _startpage regions:
|
||||
|
||||
Startpage regions
|
||||
=================
|
||||
|
||||
In the list of regions there are tags we need to map to common region tags::
|
||||
|
||||
pt-BR_BR --> pt_BR
|
||||
zh-CN_CN --> zh_Hans_CN
|
||||
zh-TW_TW --> zh_Hant_TW
|
||||
zh-TW_HK --> zh_Hant_HK
|
||||
en-GB_GB --> en_GB
|
||||
|
||||
and there is at least one tag with a three letter language tag (ISO 639-2)::
|
||||
|
||||
fil_PH --> fil_PH
|
||||
|
||||
The locale code ``no_NO`` from Startpage does not exist and is mapped to
|
||||
``nb-NO``::
|
||||
|
||||
babel.core.UnknownLocaleError: unknown locale 'no_NO'
|
||||
|
||||
For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and
|
||||
W3C recommends subtag over macrolanguage [2]_.
|
||||
|
||||
.. [1] `iana: language-subtag-registry
|
||||
<https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ ::
|
||||
|
||||
type: language
|
||||
Subtag: nb
|
||||
Description: Norwegian Bokmål
|
||||
Added: 2005-10-16
|
||||
Suppress-Script: Latn
|
||||
Macrolanguage: no
|
||||
|
||||
.. [2]
|
||||
Use macrolanguages with care. Some language subtags have a Scope field set to
|
||||
macrolanguage, i.e. this primary language subtag encompasses a number of more
|
||||
specific primary language subtags in the registry. ... As we recommended for
|
||||
the collection subtags mentioned above, in most cases you should try to use
|
||||
the more specific subtags ... `W3: The primary language subtag
|
||||
<https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_
|
||||
|
||||
.. _startpage languages:
|
||||
|
||||
Startpage languages
|
||||
===================
|
||||
|
||||
:py:obj:`send_accept_language_header`:
|
||||
The displayed name in Startpage's settings page depends on the location of the
|
||||
IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits`
|
||||
we use::
|
||||
|
||||
'Accept-Language': "en-US,en;q=0.5",
|
||||
..
|
||||
|
||||
to get uniform names independent from the IP.
|
||||
|
||||
.. _startpage categories:
|
||||
|
||||
Startpage categories
|
||||
====================
|
||||
|
||||
Startpage's category (for Web-search, News, Videos, ..) is set by
|
||||
:py:obj:`startpage_categ` in settings.yml::
|
||||
|
||||
- name: startpage
|
||||
engine: startpage
|
||||
startpage_categ: web
|
||||
...
|
||||
|
||||
.. hint::
|
||||
|
||||
The default category is ``web`` .. and other categories than ``web`` are not
|
||||
yet implemented.
|
||||
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from collections import OrderedDict
|
||||
import re
|
||||
from time import time
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from unicodedata import normalize, combining
|
||||
from time import time
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from dateutil import parser
|
||||
from lxml import html
|
||||
from babel import Locale
|
||||
from babel.localedata import locale_identifiers
|
||||
import dateutil.parser
|
||||
import lxml.html
|
||||
import babel
|
||||
|
||||
from searx.network import get
|
||||
from searx.utils import extract_text, eval_xpath, match_language
|
||||
from searx.exceptions import (
|
||||
SearxEngineResponseException,
|
||||
SearxEngineCaptchaException,
|
||||
)
|
||||
from searx import network
|
||||
from searx.utils import extract_text, eval_xpath, gen_useragent
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
from searx.locales import region_tag
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -34,18 +114,28 @@ about = {
|
|||
"results": 'HTML',
|
||||
}
|
||||
|
||||
startpage_categ = 'web'
|
||||
"""Startpage's category, visit :ref:`startpage categories`.
|
||||
"""
|
||||
|
||||
send_accept_language_header = True
|
||||
"""Startpage tries to guess user's language and territory from the HTTP
|
||||
``Accept-Language``.  Optionally the user can select a search-language (can be
|
||||
different to the UI language) and a region filter.
|
||||
"""
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general', 'web']
|
||||
# there is a mechanism to block "bot" search
|
||||
# (probably the parameter qid), require
|
||||
# storing of qid's between multiple search-calls
|
||||
|
||||
paging = True
|
||||
supported_languages_url = 'https://www.startpage.com/do/settings'
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
|
||||
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
|
||||
safesearch_dict = {0: '0', 1: '1', 2: '1'}
|
||||
|
||||
# search-url
|
||||
base_url = 'https://startpage.com/'
|
||||
search_url = base_url + 'sp/search?'
|
||||
base_url = 'https://www.startpage.com'
|
||||
search_url = base_url + '/sp/search'
|
||||
|
||||
# specific xpath variables
|
||||
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
|
||||
|
@ -53,92 +143,193 @@ search_url = base_url + 'sp/search?'
|
|||
results_xpath = '//div[@class="w-gl__result__main"]'
|
||||
link_xpath = './/a[@class="w-gl__result-title result-link"]'
|
||||
content_xpath = './/p[@class="w-gl__description"]'
|
||||
search_form_xpath = '//form[@id="search"]'
|
||||
"""XPath of Startpage's origin search form
|
||||
|
||||
.. code:: html
|
||||
|
||||
<form action="/sp/search" method="post">
|
||||
<input type="text" name="query" value="" ..>
|
||||
<input type="hidden" name="t" value="device">
|
||||
<input type="hidden" name="lui" value="english">
|
||||
<input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
|
||||
<input type="hidden" name="cat" value="web">
|
||||
<input type="hidden" class="abp" id="abp-input" name="abp" value="1">
|
||||
</form>
|
||||
"""
|
||||
|
||||
# timestamp of the last fetch of 'sc' code
|
||||
sc_code_ts = 0
|
||||
sc_code = ''
|
||||
sc_code_cache_sec = 30
|
||||
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
|
||||
|
||||
|
||||
def raise_captcha(resp):
|
||||
def get_sc_code(searxng_locale, params):
|
||||
"""Get an actual ``sc`` argument from Startpage's search form (HTML page).
|
||||
|
||||
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
|
||||
raise SearxEngineCaptchaException()
|
||||
Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
|
||||
<search_form_xpath>`. Without this argument Startpage considers the request
|
||||
is from a bot. We do not know what is encoded in the value of the ``sc``
|
||||
argument, but it seems to be a kind of a *time-stamp*.
|
||||
|
||||
|
||||
def get_sc_code(headers):
|
||||
"""Get an actual `sc` argument from startpage's home page.
|
||||
|
||||
Startpage puts a `sc` argument on every link. Without this argument
|
||||
startpage considers the request is from a bot. We do not know what is
|
||||
encoded in the value of the `sc` argument, but it seems to be a kind of a
|
||||
*time-stamp*. This *time-stamp* is valid for a few hours.
|
||||
|
||||
This function scrap a new *time-stamp* from startpage's home page every hour
|
||||
(3000 sec).
|
||||
Startpage's search form generates a new sc-code on each request. This
|
||||
function scrapes a new sc-code from Startpage's home page every
|
||||
:py:obj:`sc_code_cache_sec` seconds.
|
||||
|
||||
"""
|
||||
|
||||
global sc_code_ts, sc_code # pylint: disable=global-statement
|
||||
|
||||
if time() > (sc_code_ts + 3000):
|
||||
logger.debug("query new sc time-stamp ...")
|
||||
if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
|
||||
logger.debug("get_sc_code: reuse '%s'", sc_code)
|
||||
return sc_code
|
||||
|
||||
resp = get(base_url, headers=headers)
|
||||
raise_captcha(resp)
|
||||
dom = html.fromstring(resp.text)
|
||||
headers = {**params['headers']}
|
||||
headers['Origin'] = base_url
|
||||
headers['Referer'] = base_url + '/'
|
||||
# headers['Connection'] = 'keep-alive'
|
||||
# headers['Accept-Encoding'] = 'gzip, deflate, br'
|
||||
# headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
|
||||
# headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'
|
||||
|
||||
try:
|
||||
# <input type="hidden" name="sc" value="...">
|
||||
sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0]
|
||||
except IndexError as exc:
|
||||
# suspend startpage API --> https://github.com/searxng/searxng/pull/695
|
||||
raise SearxEngineResponseException(
|
||||
suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!"
|
||||
) from exc
|
||||
# add Accept-Language header
|
||||
if searxng_locale == 'all':
|
||||
searxng_locale = 'en-US'
|
||||
locale = babel.Locale.parse(searxng_locale, sep='-')
|
||||
|
||||
sc_code_ts = time()
|
||||
logger.debug("new value is: %s", sc_code)
|
||||
if send_accept_language_header:
|
||||
ac_lang = locale.language
|
||||
if locale.territory:
|
||||
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
|
||||
locale.language,
|
||||
locale.territory,
|
||||
locale.language,
|
||||
)
|
||||
headers['Accept-Language'] = ac_lang
|
||||
|
||||
get_sc_url = base_url + '/?sc=%s' % (sc_code)
|
||||
logger.debug("query new sc time-stamp ... %s", get_sc_url)
|
||||
logger.debug("headers: %s", headers)
|
||||
resp = network.get(get_sc_url, headers=headers)
|
||||
|
||||
# ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
|
||||
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
|
||||
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
|
||||
|
||||
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
|
||||
raise SearxEngineCaptchaException(
|
||||
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
|
||||
)
|
||||
|
||||
dom = lxml.html.fromstring(resp.text)
|
||||
|
||||
try:
|
||||
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
|
||||
except IndexError as exc:
|
||||
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
|
||||
raise SearxEngineCaptchaException(
|
||||
message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
|
||||
) from exc
|
||||
|
||||
sc_code_ts = time()
|
||||
logger.debug("get_sc_code: new value is: %s", sc_code)
|
||||
return sc_code
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
    """Assemble a Startpage request.

    To avoid CAPTCHA we need to send a well formed HTTP POST request with a
    cookie.  We need to form a request that is identical to the request built
    by Startpage's search form:

    - in the cookie the **region** is selected
    - in the HTTP POST data the **language** is selected

    Additionally the arguments from Startpage's search form need to be set in
    HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.

    Only the ``web`` category is dispatched (to ``_request_cat_web``).  For any
    other value of ``startpage_categ`` an error is logged and ``params`` is
    returned unmodified.
    """
    # pylint: disable=line-too-long
    # The format string from Startpage's FFox add-on [1]::
    #
    #     https://www.startpage.com/do/dsearch?query={searchTerms}&cat=web&pl=ext-ff&language=__MSG_extensionUrlLanguage__&extVersion=1.3.0
    #
    # [1] https://addons.mozilla.org/en-US/firefox/addon/startpage-private-search/
    if startpage_categ == 'web':
        return _request_cat_web(query, params)

    # The placeholder has to be '%s': a bare '%' followed by a quote is not a
    # valid conversion, the logging module would fail to format the record and
    # the message would never reach the log.
    logger.error("Startpages's category '%s' is not yet implemented.", startpage_categ)
    return params
|
||||
|
||||
|
||||
def _request_cat_web(query, params):
|
||||
|
||||
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en')
|
||||
|
||||
# build arguments
|
||||
args = {
|
||||
'query': query,
|
||||
'page': params['pageno'],
|
||||
'cat': 'web',
|
||||
# 'pl': 'ext-ff',
|
||||
# 'extVersion': '1.3.0',
|
||||
# 'abp': "-1",
|
||||
'sc': get_sc_code(params['headers']),
|
||||
't': 'device',
|
||||
'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers,
|
||||
'with_date': time_range_dict.get(params['time_range'], ''),
|
||||
}
|
||||
|
||||
# set language if specified
|
||||
if params['language'] != 'all':
|
||||
lang_code = match_language(params['language'], supported_languages, fallback=None)
|
||||
if lang_code:
|
||||
language_name = supported_languages[lang_code]['alias']
|
||||
args['language'] = language_name
|
||||
args['lui'] = language_name
|
||||
if engine_language:
|
||||
args['language'] = engine_language
|
||||
args['lui'] = engine_language
|
||||
|
||||
args['abp'] = '1'
|
||||
if params['pageno'] > 1:
|
||||
args['page'] = params['pageno']
|
||||
|
||||
# build cookie
|
||||
lang_homepage = 'en'
|
||||
cookie = OrderedDict()
|
||||
cookie['date_time'] = 'world'
|
||||
cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
|
||||
cookie['disable_open_in_new_window'] = '0'
|
||||
cookie['enable_post_method'] = '1' # hint: POST
|
||||
cookie['enable_proxy_safety_suggest'] = '1'
|
||||
cookie['enable_stay_control'] = '1'
|
||||
cookie['instant_answers'] = '1'
|
||||
cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
|
||||
cookie['num_of_results'] = '10'
|
||||
cookie['suggestions'] = '1'
|
||||
cookie['wt_unit'] = 'celsius'
|
||||
|
||||
if engine_language:
|
||||
cookie['language'] = engine_language
|
||||
cookie['language_ui'] = engine_language
|
||||
|
||||
if engine_region:
|
||||
cookie['search_results_region'] = engine_region
|
||||
|
||||
params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
|
||||
logger.debug('cookie preferences: %s', params['cookies']['preferences'])
|
||||
|
||||
# POST request
|
||||
logger.debug("data: %s", args)
|
||||
params['data'] = args
|
||||
params['method'] = 'POST'
|
||||
params['url'] = search_url
|
||||
params['headers']['Origin'] = base_url
|
||||
params['headers']['Referer'] = base_url + '/'
|
||||
# is the Accept header needed?
|
||||
# params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
|
||||
params['url'] = search_url + urlencode(args)
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
    """Parse Startpage's HTML response and dispatch on the configured category.

    The response text is parsed once into an lxml DOM.  For the ``web``
    category the DOM is handed to ``_response_cat_web`` and its result list is
    returned.  For any other value of ``startpage_categ`` an error is logged
    and an empty list is returned.
    """
    dom = lxml.html.fromstring(resp.text)

    if startpage_categ == 'web':
        return _response_cat_web(dom)

    # '%s' (not a bare '%') is required, otherwise the logging module fails to
    # format the record and the message is lost.
    logger.error("Startpages's category '%s' is not yet implemented.", startpage_categ)
    return []
|
||||
|
||||
|
||||
def _response_cat_web(dom):
|
||||
results = []
|
||||
|
||||
# parse results
|
||||
for result in eval_xpath(dom, results_xpath):
|
||||
|
@ -173,7 +364,7 @@ def response(resp):
|
|||
content = content[date_pos:]
|
||||
|
||||
try:
|
||||
published_date = parser.parse(date_string, dayfirst=True)
|
||||
published_date = dateutil.parser.parse(date_string, dayfirst=True)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
@ -199,62 +390,103 @@ def response(resp):
|
|||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def _fetch_supported_languages(resp):
|
||||
# startpage's language selector is a mess each option has a displayed name
|
||||
# and a value, either of which may represent the language name in the native
|
||||
# script, the language name in English, an English transliteration of the
|
||||
# native name, the English name of the writing script used by the language,
|
||||
# or occasionally something else entirely.
|
||||
def fetch_traits(engine_traits: EngineTraits):
|
||||
"""Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
|
||||
regions>` from Startpage."""
|
||||
# pylint: disable=too-many-branches
|
||||
|
||||
# this cases are so special they need to be hardcoded, a couple of them are misspellings
|
||||
language_names = {
|
||||
'english_uk': 'en-GB',
|
||||
'fantizhengwen': ['zh-TW', 'zh-HK'],
|
||||
'hangul': 'ko',
|
||||
'malayam': 'ml',
|
||||
'norsk': 'nb',
|
||||
'sinhalese': 'si',
|
||||
'sudanese': 'su',
|
||||
headers = {
|
||||
'User-Agent': gen_useragent(),
|
||||
'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language
|
||||
}
|
||||
resp = network.get('https://www.startpage.com/do/settings', headers=headers)
|
||||
|
||||
# get the English name of every language known by babel
|
||||
language_names.update(
|
||||
{
|
||||
# fmt: off
|
||||
name.lower(): lang_code
|
||||
# pylint: disable=protected-access
|
||||
for lang_code, name in Locale('en')._data['languages'].items()
|
||||
# fmt: on
|
||||
}
|
||||
)
|
||||
if not resp.ok:
|
||||
print("ERROR: response from Startpage is not OK.")
|
||||
|
||||
dom = lxml.html.fromstring(resp.text)
|
||||
|
||||
# regions
|
||||
|
||||
sp_region_names = []
|
||||
for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
|
||||
sp_region_names.append(option.get('value'))
|
||||
|
||||
for eng_tag in sp_region_names:
|
||||
if eng_tag == 'all':
|
||||
continue
|
||||
babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway
|
||||
|
||||
if '-' in babel_region_tag:
|
||||
l, r = babel_region_tag.split('-')
|
||||
r = r.split('_')[-1]
|
||||
sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))
|
||||
|
||||
else:
|
||||
try:
|
||||
sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))
|
||||
|
||||
except babel.UnknownLocaleError:
|
||||
print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
|
||||
continue
|
||||
|
||||
conflict = engine_traits.regions.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_tag:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
|
||||
continue
|
||||
engine_traits.regions[sxng_tag] = eng_tag
|
||||
|
||||
# languages
|
||||
|
||||
catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}
|
||||
|
||||
# get the native name of every language known by babel
|
||||
for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
|
||||
native_name = Locale(lang_code).get_language_name().lower()
|
||||
|
||||
for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
|
||||
native_name = babel.Locale(lang_code).get_language_name().lower()
|
||||
# add native name exactly as it is
|
||||
language_names[native_name] = lang_code
|
||||
catalog_engine2code[native_name] = lang_code
|
||||
|
||||
# add "normalized" language name (i.e. français becomes francais and español becomes espanol)
|
||||
unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
|
||||
if len(unaccented_name) == len(unaccented_name.encode()):
|
||||
# add only if result is ascii (otherwise "normalization" didn't work)
|
||||
language_names[unaccented_name] = lang_code
|
||||
catalog_engine2code[unaccented_name] = lang_code
|
||||
|
||||
# values that can't be determined by babel's languages names
|
||||
|
||||
catalog_engine2code.update(
|
||||
{
|
||||
# traditional chinese used in ..
|
||||
'fantizhengwen': 'zh_Hant',
|
||||
# Korean alphabet
|
||||
'hangul': 'ko',
|
||||
# Malayalam is one of 22 scheduled languages of India.
|
||||
'malayam': 'ml',
|
||||
'norsk': 'nb',
|
||||
'sinhalese': 'si',
|
||||
}
|
||||
)
|
||||
|
||||
skip_eng_tags = {
|
||||
'english_uk', # SearXNG lang 'en' already maps to 'english'
|
||||
}
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
sp_lang_names = []
|
||||
for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
|
||||
sp_lang_names.append((option.get('value'), extract_text(option).lower()))
|
||||
|
||||
supported_languages = {}
|
||||
for sp_option_value, sp_option_text in sp_lang_names:
|
||||
lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
|
||||
if isinstance(lang_code, str):
|
||||
supported_languages[lang_code] = {'alias': sp_option_value}
|
||||
elif isinstance(lang_code, list):
|
||||
for _lc in lang_code:
|
||||
supported_languages[_lc] = {'alias': sp_option_value}
|
||||
else:
|
||||
print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))
|
||||
eng_tag = option.get('value')
|
||||
if eng_tag in skip_eng_tags:
|
||||
continue
|
||||
name = extract_text(option).lower()
|
||||
|
||||
return supported_languages
|
||||
sxng_tag = catalog_engine2code.get(eng_tag)
|
||||
if sxng_tag is None:
|
||||
sxng_tag = catalog_engine2code[name]
|
||||
|
||||
conflict = engine_traits.languages.get(sxng_tag)
|
||||
if conflict:
|
||||
if conflict != eng_tag:
|
||||
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
|
||||
continue
|
||||
engine_traits.languages[sxng_tag] = eng_tag
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Wikidata
|
||||
"""This module implements the Wikidata engine. Some implementations are shared
|
||||
from :ref:`wikipedia engine`.
|
||||
|
||||
"""
|
||||
# pylint: disable=missing-class-docstring
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from hashlib import md5
|
||||
from urllib.parse import urlencode, unquote
|
||||
from json import loads
|
||||
|
@ -13,12 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_
|
|||
|
||||
from searx.data import WIKIDATA_UNITS
|
||||
from searx.network import post, get
|
||||
from searx.utils import match_language, searx_useragent, get_string_replaces_function
|
||||
from searx.utils import searx_useragent, get_string_replaces_function
|
||||
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
|
||||
from searx.engines.wikipedia import ( # pylint: disable=unused-import
|
||||
_fetch_supported_languages,
|
||||
supported_languages_url,
|
||||
)
|
||||
from searx.engines.wikipedia import fetch_traits as _fetch_traits
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger: logging.Logger
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -154,33 +162,35 @@ def send_wikidata_query(query, method='GET'):
|
|||
|
||||
|
||||
def request(query, params):
|
||||
language = params['language'].split('-')[0]
|
||||
if language == 'all':
|
||||
language = 'en'
|
||||
else:
|
||||
language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
|
||||
|
||||
# wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
|
||||
# mapped to zh
|
||||
sxng_lang = params['searxng_locale'].split('-')[0]
|
||||
language = traits.get_language(sxng_lang, 'en')
|
||||
|
||||
query, attributes = get_query(query, language)
|
||||
logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
|
||||
|
||||
params['method'] = 'POST'
|
||||
params['url'] = SPARQL_ENDPOINT_URL
|
||||
params['data'] = {'query': query}
|
||||
params['headers'] = get_headers()
|
||||
|
||||
params['language'] = language
|
||||
params['attributes'] = attributes
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
|
||||
results = []
|
||||
jsonresponse = loads(resp.content.decode())
|
||||
|
||||
language = resp.search_params['language'].lower()
|
||||
language = resp.search_params['language']
|
||||
attributes = resp.search_params['attributes']
|
||||
logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
|
||||
|
||||
seen_entities = set()
|
||||
|
||||
for result in jsonresponse.get('results', {}).get('bindings', []):
|
||||
attribute_result = {key: value['value'] for key, value in result.items()}
|
||||
entity_url = attribute_result['item']
|
||||
|
@ -756,3 +766,15 @@ def init(engine_settings=None): # pylint: disable=unused-argument
|
|||
lang = result['name']['xml:lang']
|
||||
entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
|
||||
WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
    """Use languages evaluated from :py:obj:`wikipedia.fetch_traits
    <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans) which
    is not supported by wikidata."""

    _fetch_traits(engine_traits)

    # wikidata does not support zh-classical (zh_Hans); pop with a default so a
    # missing alias in the shared Wikipedia traits does not abort the update
    engine_traits.languages.pop('zh_Hans', None)

    # wikidata does not have net-locations for the languages
    engine_traits.custom['wiki_netloc'] = {}
|
||||
|
|
|
@ -1,13 +1,26 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Wikipedia (Web)
|
||||
# lint: pylint
|
||||
"""This module implements the Wikipedia engine. Some of this implementations
|
||||
are shared by other engines:
|
||||
|
||||
- :ref:`wikidata engine`
|
||||
|
||||
The list of supported languages is fetched from the article linked by
|
||||
:py:obj:`wikipedia_article_depth`. Unlike traditional search engines, wikipedia
|
||||
does not support one Wikipedia for all the languages, but there is one Wikipedia
|
||||
for every language (:py:obj:`fetch_traits`).
|
||||
"""
|
||||
|
||||
from urllib.parse import quote
|
||||
from json import loads
|
||||
from lxml.html import fromstring
|
||||
from searx.utils import match_language, searx_useragent
|
||||
from searx.network import raise_for_httperror
|
||||
import urllib.parse
|
||||
import babel
|
||||
|
||||
from lxml import html
|
||||
|
||||
from searx import network
|
||||
from searx.locales import language_tag
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -19,32 +32,40 @@ about = {
|
|||
"results": 'JSON',
|
||||
}
|
||||
|
||||
|
||||
send_accept_language_header = True
|
||||
|
||||
# search-url
|
||||
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
|
||||
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
|
||||
language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
|
||||
wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
|
||||
"""The *editing depth* of Wikipedia is one of several possible rough indicators
|
||||
of the encyclopedia's collaborative quality, showing how frequently its articles
|
||||
are updated. The measurement of depth was introduced after some limitations of
|
||||
the classic measurement of article count were realized.
|
||||
"""
|
||||
|
||||
# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
|
||||
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
|
||||
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
|
||||
the first paragraph of the page in plain text and HTML as well as the type of
|
||||
page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
|
||||
and link previews in the apps.
|
||||
|
||||
.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
|
||||
|
||||
"""
|
||||
|
||||
|
||||
# set language in base_url
|
||||
def url_lang(lang):
|
||||
lang_pre = lang.split('-')[0]
|
||||
if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
|
||||
return 'en'
|
||||
return match_language(lang, supported_languages, language_aliases).split('-')[0]
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
"""Assemble a request (`wikipedia rest_v1 summary API`_)."""
|
||||
if query.islower():
|
||||
query = query.title()
|
||||
|
||||
language = url_lang(params['language'])
|
||||
params['url'] = search_url.format(title=quote(query), language=language)
|
||||
engine_language = traits.get_language(params['searxng_locale'], 'en')
|
||||
wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
|
||||
title = urllib.parse.quote(query)
|
||||
|
||||
# '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
|
||||
# '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
|
||||
params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
|
||||
|
||||
params['headers']['User-Agent'] = searx_useragent()
|
||||
params['raise_for_httperror'] = False
|
||||
params['soft_max_redirects'] = 2
|
||||
|
||||
|
@ -53,13 +74,14 @@ def request(query, params):
|
|||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
|
||||
results = []
|
||||
if resp.status_code == 404:
|
||||
return []
|
||||
|
||||
if resp.status_code == 400:
|
||||
try:
|
||||
api_result = loads(resp.text)
|
||||
except:
|
||||
api_result = resp.json()
|
||||
except Exception: # pylint: disable=broad-except
|
||||
pass
|
||||
else:
|
||||
if (
|
||||
|
@ -68,49 +90,135 @@ def response(resp):
|
|||
):
|
||||
return []
|
||||
|
||||
raise_for_httperror(resp)
|
||||
|
||||
results = []
|
||||
api_result = loads(resp.text)
|
||||
|
||||
# skip disambiguation pages
|
||||
if api_result.get('type') != 'standard':
|
||||
return []
|
||||
network.raise_for_httperror(resp)
|
||||
|
||||
api_result = resp.json()
|
||||
title = api_result['title']
|
||||
wikipedia_link = api_result['content_urls']['desktop']['page']
|
||||
results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})
|
||||
|
||||
results.append({'url': wikipedia_link, 'title': title})
|
||||
|
||||
results.append(
|
||||
{
|
||||
'infobox': title,
|
||||
'id': wikipedia_link,
|
||||
'content': api_result.get('extract', ''),
|
||||
'img_src': api_result.get('thumbnail', {}).get('source'),
|
||||
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
|
||||
}
|
||||
)
|
||||
if api_result.get('type') == 'standard':
|
||||
results.append(
|
||||
{
|
||||
'infobox': title,
|
||||
'id': wikipedia_link,
|
||||
'content': api_result.get('extract', ''),
|
||||
'img_src': api_result.get('thumbnail', {}).get('source'),
|
||||
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def _fetch_supported_languages(resp):
|
||||
supported_languages = {}
|
||||
dom = fromstring(resp.text)
|
||||
tables = dom.xpath('//table[contains(@class,"sortable")]')
|
||||
for table in tables:
|
||||
# exclude header row
|
||||
trs = table.xpath('.//tr')[1:]
|
||||
for tr in trs:
|
||||
td = tr.xpath('./td')
|
||||
code = td[3].xpath('./a')[0].text
|
||||
name = td[1].xpath('./a')[0].text
|
||||
english_name = td[1].xpath('./a')[0].text
|
||||
articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
|
||||
# exclude languages with too few articles
|
||||
if articles >= 100:
|
||||
supported_languages[code] = {"name": name, "english_name": english_name}
|
||||
# Nonstandard language codes
|
||||
#
|
||||
# These Wikipedias use language codes that do not conform to the ISO 639
|
||||
# standard (which is how wiki subdomains are chosen nowadays).
|
||||
|
||||
return supported_languages
|
||||
lang_map = {
|
||||
'be-tarask': 'bel',
|
||||
'ak': 'aka',
|
||||
'als': 'gsw',
|
||||
'bat-smg': 'sgs',
|
||||
'cbk-zam': 'cbk',
|
||||
'fiu-vro': 'vro',
|
||||
'map-bms': 'map',
|
||||
'nrm': 'nrf',
|
||||
'roa-rup': 'rup',
|
||||
'nds-nl': 'nds',
|
||||
#'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
|
||||
'zh-min-nan': 'nan',
|
||||
'zh-yue': 'yue',
|
||||
'an': 'arg',
|
||||
'zh-classical': 'zh-Hant', # babel maps classical to zh-Hans (for whatever reason)
|
||||
}
|
||||
|
||||
unknown_langs = [
|
||||
'an', # Aragonese
|
||||
'ba', # Bashkir
|
||||
'bar', # Bavarian
|
||||
'bcl', # Central Bicolano
|
||||
'be-tarask', # Belarusian variant / Belarusian is already covered by 'be'
|
||||
'bpy', # Bishnupriya Manipuri is unknown by babel
|
||||
'hif', # Fiji Hindi
|
||||
'ilo', # Ilokano
|
||||
'li', # Limburgish
|
||||
'sco', # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
|
||||
'sh', # Serbo-Croatian
|
||||
'simple', # simple english is not know as a natural language different to english (babel)
|
||||
'vo', # Volapük
|
||||
'wa', # Walloon
|
||||
]
|
||||
|
||||
|
||||
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.

    The table linked by :py:obj:`wikipedia_article_depth` is read row by row.
    Wikipedias with fewer than 10000 articles or an editing depth below 20 are
    skipped, as are the tags listed in :py:obj:`unknown_langs`.

    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  The keys are Wikipedia's own language tags (the engine
    tags).  Here is a reduced example:

    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "als": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }

    """

    engine_traits.custom['wiki_netloc'] = {}

    # insert alias to map from a region like zh-CN to a language zh_Hans
    engine_traits.languages['zh_Hans'] = 'zh'

    resp = network.get(wikipedia_article_depth)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")
        # there is no language table to parse in a failed response
        return

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            # header rows have no <td> cells
            continue
        cols = [c.text_content().strip() for c in cols]

        # a lone '-' in the depth column is read as 0
        depth = float(cols[3].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', ''))

        if articles < 10000:
            # exclude languages with too few articles
            continue

        if int(depth) < 20:
            # Rough indicator of a Wikipedia’s quality, showing how frequently
            # its articles are updated.
            continue

        eng_tag = cols[2]
        if eng_tag in unknown_langs:
            continue

        wiki_url = row.xpath('./td[3]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc
|
||||
|
|
|
@ -17,8 +17,10 @@ from searx.utils import (
|
|||
eval_xpath_getindex,
|
||||
eval_xpath_list,
|
||||
extract_text,
|
||||
match_language,
|
||||
)
|
||||
from searx.enginelib.traits import EngineTraits
|
||||
|
||||
traits: EngineTraits
|
||||
|
||||
# about
|
||||
about = {
|
||||
|
@ -34,8 +36,7 @@ about = {
|
|||
categories = ['general', 'web']
|
||||
paging = True
|
||||
time_range_support = True
|
||||
supported_languages_url = 'https://search.yahoo.com/preferences/languages'
|
||||
"""Supported languages are read from Yahoo preference page."""
|
||||
# send_accept_language_header = True
|
||||
|
||||
time_range_dict = {
|
||||
'day': ('1d', 'd'),
|
||||
|
@ -43,15 +44,10 @@ time_range_dict = {
|
|||
'month': ('1m', 'm'),
|
||||
}
|
||||
|
||||
language_aliases = {
|
||||
'zh-HK': 'zh_chs',
|
||||
'zh-CN': 'zh_chs', # dead since 2015 / routed to hk.search.yahoo.com
|
||||
'zh-TW': 'zh_cht',
|
||||
}
|
||||
|
||||
lang2domain = {
|
||||
'zh_chs': 'hk.search.yahoo.com',
|
||||
'zh_cht': 'tw.search.yahoo.com',
|
||||
'any': 'search.yahoo.com',
|
||||
'en': 'search.yahoo.com',
|
||||
'bg': 'search.yahoo.com',
|
||||
'cs': 'search.yahoo.com',
|
||||
|
@ -67,21 +63,23 @@ lang2domain = {
|
|||
}
|
||||
"""Map language to domain"""
|
||||
|
||||
|
||||
def _get_language(params):
|
||||
|
||||
lang = language_aliases.get(params['language'])
|
||||
if lang is None:
|
||||
lang = match_language(params['language'], supported_languages, language_aliases)
|
||||
lang = lang.split('-')[0]
|
||||
logger.debug("params['language']: %s --> %s", params['language'], lang)
|
||||
return lang
|
||||
locale_aliases = {
|
||||
'zh': 'zh_Hans',
|
||||
'zh-HK': 'zh_Hans',
|
||||
'zh-CN': 'zh_Hans', # dead since 2015 / routed to hk.search.yahoo.com
|
||||
'zh-TW': 'zh_Hant',
|
||||
}
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""build request"""
|
||||
|
||||
lang = locale_aliases.get(params['language'], None)
|
||||
if not lang:
|
||||
lang = params['language'].split('-')[0]
|
||||
lang = traits.get_language(lang, traits.all_locale)
|
||||
|
||||
offset = (params['pageno'] - 1) * 7 + 1
|
||||
lang = _get_language(params)
|
||||
age, btf = time_range_dict.get(params['time_range'], ('', ''))
|
||||
|
||||
args = urlencode(
|
||||
|
@ -154,13 +152,37 @@ def response(resp):
|
|||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def _fetch_supported_languages(resp):
|
||||
supported_languages = []
|
||||
def fetch_traits(engine_traits: EngineTraits):
    """Fetch the languages supported by yahoo and store them in ``engine_traits``.

    The language list is read from Yahoo's preference page.  Each engine tag
    (the ``value`` of a ``lang-item`` input without its ``lang_`` prefix) is
    mapped to a SearXNG language tag and stored in
    ``engine_traits.languages``.

    :param engine_traits: traits object of the engine, modified in place.
    """

    # pylint: disable=import-outside-toplevel
    import babel
    from searx import network
    from searx.locales import language_tag

    engine_traits.all_locale = 'any'

    resp = network.get('https://search.yahoo.com/preferences/languages')
    if not resp.ok:
        # Name the right engine, and don't try to parse an error page.
        print("ERROR: response from yahoo is not OK.")
        return

    dom = html.fromstring(resp.text)
    # the value attributes look like 'lang_<tag>', strip the prefix
    offset = len('lang_')

    # Yahoo's tags for simplified / traditional chinese are unknown by babel
    eng2sxng = {'zh_chs': 'zh_Hans', 'zh_cht': 'zh_Hant'}

    for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
        eng_tag = val[offset:]

        try:
            sxng_tag = language_tag(babel.Locale.parse(eng2sxng.get(eng_tag, eng_tag)))
        except babel.UnknownLocaleError:
            print('ERROR: unknown language --> %s' % eng_tag)
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            # first mapping wins, report only if the engine tags differ
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag
|
||||
|
|
190
searx/locales.py
190
searx/locales.py
|
@ -4,11 +4,11 @@
|
|||
"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
|
||||
"""
|
||||
|
||||
from typing import Set
|
||||
from typing import Set, Optional, List
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
from babel import Locale
|
||||
import babel
|
||||
from babel.support import Translations
|
||||
import babel.languages
|
||||
import babel.core
|
||||
|
@ -134,7 +134,7 @@ def locales_initialize(directory=None):
|
|||
flask_babel.get_translations = get_translations
|
||||
|
||||
for tag, descr in ADDITIONAL_TRANSLATIONS.items():
|
||||
locale = Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
|
||||
locale = babel.Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
|
||||
LOCALE_NAMES[tag] = descr
|
||||
if locale.text_direction == 'rtl':
|
||||
RTL_LOCALES.add(tag)
|
||||
|
@ -142,7 +142,7 @@ def locales_initialize(directory=None):
|
|||
for tag in LOCALE_BEST_MATCH:
|
||||
descr = LOCALE_NAMES.get(tag)
|
||||
if not descr:
|
||||
locale = Locale.parse(tag, sep='-')
|
||||
locale = babel.Locale.parse(tag, sep='-')
|
||||
LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_'))
|
||||
if locale.text_direction == 'rtl':
|
||||
RTL_LOCALES.add(tag)
|
||||
|
@ -154,12 +154,77 @@ def locales_initialize(directory=None):
|
|||
tag = dirname.replace('_', '-')
|
||||
descr = LOCALE_NAMES.get(tag)
|
||||
if not descr:
|
||||
locale = Locale.parse(dirname)
|
||||
locale = babel.Locale.parse(dirname)
|
||||
LOCALE_NAMES[tag] = get_locale_descr(locale, dirname)
|
||||
if locale.text_direction == 'rtl':
|
||||
RTL_LOCALES.add(tag)
|
||||
|
||||
|
||||
def region_tag(locale: babel.Locale) -> str:
    """Returns SearXNG's region tag from the locale (e.g. zh-TW , en-US).

    :param locale: locale the region tag is built from.
    :raises ValueError: if ``locale`` has no territory.
    """
    if not locale.territory:
        # the message names the offending locale (the '%s' placeholder was
        # never filled before)
        raise ValueError(f'{locale} missed a territory')
    return locale.language + '-' + locale.territory
|
||||
|
||||
|
||||
def language_tag(locale: babel.Locale) -> str:
    """Returns SearXNG's language tag from the locale.

    The tag is the language code of the locale; if the locale has a script,
    the script name is appended with an underscore (e.g. en, zh_Hant).
    """
    if locale.script:
        return locale.language + '_' + locale.script
    return locale.language
|
||||
|
||||
|
||||
def get_locale(locale_tag: str) -> Optional[babel.Locale]:
    """Returns a :py:obj:`babel.Locale` object parsed from argument
    ``locale_tag`` or ``None`` if the tag can't be parsed.

    :param locale_tag: locale tag using ``-`` as separator (``de``, ``fr-BE``,
      ``zh-TW`` ..)
    """
    try:
        return babel.Locale.parse(locale_tag, sep='-')
    except (babel.core.UnknownLocaleError, ValueError):
        # UnknownLocaleError: well-formed tag, but babel has no locale data
        # ValueError: the string is not a valid locale identifier at all
        return None
|
||||
|
||||
|
||||
def get_offical_locales(
    territory: str, languages=None, regional: bool = False, de_facto: bool = True
) -> Set[babel.Locale]:
    """Returns a set of :py:obj:`babel.Locale` with languages from
    :py:obj:`babel.languages.get_official_languages`.

    :param territory: The territory (country or region) code.

    :param languages: A list of language codes the languages from
      :py:obj:`babel.languages.get_official_languages` should be in
      (intersection).  The comparison is case-insensitive.  If this argument
      is ``None`` (or empty), all official languages in this territory are
      used.

    :param regional: If the regional flag is set, then languages which are
      regionally official are also returned.

    :param de_facto: If the de_facto flag is set to `False`, then languages
      which are “de facto” official are not returned.

    """
    official = babel.languages.get_official_languages(territory, regional=regional, de_facto=de_facto)

    if languages:
        wanted = {code.lower() for code in languages}
        official = [code for code in official if code.lower() in wanted]

    locales = set()
    for code in official:
        try:
            locales.add(babel.Locale.parse(code + '_' + territory))
        except babel.UnknownLocaleError:
            # babel has no locale data for this language / territory pair
            continue

    return locales
|
||||
|
||||
|
||||
def get_engine_locale(searxng_locale, engine_locales, default=None):
|
||||
"""Return engine's language (aka locale) string that best fits to argument
|
||||
``searxng_locale``.
|
||||
|
@ -177,6 +242,10 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
|
|||
...
|
||||
'pl-PL' : 'pl_PL',
|
||||
'pt-PT' : 'pt_PT'
|
||||
..
|
||||
'zh' : 'zh'
|
||||
'zh_Hans' : 'zh'
|
||||
'zh_Hant' : 'zh-classical'
|
||||
}
|
||||
|
||||
.. hint::
|
||||
|
@ -210,13 +279,13 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
|
|||
engine.
|
||||
|
||||
"""
|
||||
# pylint: disable=too-many-branches
|
||||
# pylint: disable=too-many-branches, too-many-return-statements
|
||||
|
||||
engine_locale = engine_locales.get(searxng_locale)
|
||||
|
||||
if engine_locale is not None:
|
||||
# There was a 1:1 mapping (e.g. "fr-BE --> fr_BE" or "fr --> fr_FR"), no
|
||||
# need to narrow language nor territory.
|
||||
# There was a 1:1 mapping (e.g. a region "fr-BE --> fr_BE" or a language
|
||||
# "zh --> zh"), no need to narrow language-script nor territory.
|
||||
return engine_locale
|
||||
|
||||
try:
|
||||
|
@ -227,6 +296,12 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
|
|||
except babel.core.UnknownLocaleError:
|
||||
return default
|
||||
|
||||
searxng_lang = language_tag(locale)
|
||||
engine_locale = engine_locales.get(searxng_lang)
|
||||
if engine_locale is not None:
|
||||
# There was a 1:1 mapping (e.g. "zh-HK --> zh_Hant" or "zh-CN --> zh_Hans")
|
||||
return engine_locale
|
||||
|
||||
# SearXNG's selected locale is not supported by the engine ..
|
||||
|
||||
if locale.territory:
|
||||
|
@ -247,10 +322,6 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
|
|||
|
||||
if locale.language:
|
||||
|
||||
searxng_lang = locale.language
|
||||
if locale.script:
|
||||
searxng_lang += '_' + locale.script
|
||||
|
||||
terr_lang_dict = {}
|
||||
for territory, langs in babel.core.get_global("territory_languages").items():
|
||||
if not langs.get(searxng_lang, {}).get('official_status'):
|
||||
|
@ -303,3 +374,98 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
|
|||
engine_locale = default
|
||||
|
||||
return default
|
||||
|
||||
|
||||
def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]:
    """Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``.

    :param str searxng_locale: SearXNG's internal representation of locale (de,
        de-DE, fr-BE, zh, zh-CN, zh-TW ..).

    :param list locale_tag_list: The list of locale tags to select from

    :param str fallback: fallback locale tag (if unset --> ``None``)

    The rules to find a match are implemented in :py:obj:`get_engine_locale`,
    the ``engine_locales`` is built up by :py:obj:`build_engine_locales`.

    .. hint::

       The *SearXNG locale* string and the members of ``locale_tag_list`` have
       to be known by babel!  The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in
       the UI and are not known by babel --> will be ignored.
    """

    # example:
    #   searxng_locale = 'es'
    #   locale_tag_list = ['es-AR', 'es-ES', 'es-MX']

    if not searxng_locale:
        return fallback

    locale = get_locale(searxng_locale)
    if locale is None:
        return fallback

    # normalize to a SearXNG locale that can be passed to get_engine_locale:
    # a region tag if there is a territory, otherwise a language tag
    normalized = region_tag(locale) if locale.territory else language_tag(locale)

    # clean up locale_tag_list: drop pseudo tags and UI-only translations
    candidates = [
        tag for tag in locale_tag_list if tag not in ('all', 'auto') and tag not in ADDITIONAL_TRANSLATIONS
    ]

    # emulate fetch_traits
    engine_locales = build_engine_locales(candidates)
    return get_engine_locale(normalized, engine_locales, default=fallback)
|
||||
|
||||
|
||||
def build_engine_locales(tag_list: List[str]):
    """From a list of locale tags a dictionary is built that can be passed by
    argument ``engine_locales`` to :py:obj:`get_engine_locale`.  This function
    is mainly used by :py:obj:`match_locale` and is similar to what the
    ``fetch_traits(..)`` function of engines do.

    If there are territory codes in the ``tag_list`` that have a *script code*
    additional keys are added to the returned dictionary.

    .. code:: python

       >>> import locales
       >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
       >>> engine_locales
       {
           'en': 'en', 'en-US': 'en-US',
           'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
           'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
       }
       >>> get_engine_locale('zh-Hans', engine_locales)
       'zh-CN'

    This function is a good example to understand the language/region model
    of SearXNG:

      SearXNG only distinguishes between **search languages** and **search
      regions**, by adding the *script-tags*, languages with *script-tags* can
      be assigned to the **regions** that SearXNG supports.

    :param tag_list: locale tags, tags that can't be parsed are skipped (a
      warning is logged).
    :return: dictionary mapping SearXNG's region / language tags to the tag
      from ``tag_list``.
    """
    engine_locales = {}

    for tag in tag_list:
        locale = get_locale(tag)
        if locale is None:
            # Logger.warn() is a deprecated alias of Logger.warning()
            logger.warning("build_engine_locales: skip locale tag %s / unknown by babel", tag)
            continue
        if locale.territory:
            engine_locales[region_tag(locale)] = tag
            if locale.script:
                # region with a script (e.g. zh-CN --> zh_Hans): the
                # language-script tag is mapped to the same engine tag
                engine_locales[language_tag(locale)] = tag
        else:
            engine_locales[language_tag(locale)] = tag
    return engine_locales
|
||||
|
|
|
@ -13,7 +13,7 @@ from typing import Iterable, Dict, List
|
|||
import flask
|
||||
|
||||
from searx import settings, autocomplete
|
||||
from searx.engines import Engine
|
||||
from searx.enginelib import Engine
|
||||
from searx.plugins import Plugin
|
||||
from searx.locales import LOCALE_NAMES
|
||||
from searx.webutils import VALID_LANGUAGE_CODE
|
||||
|
|
|
@ -4,7 +4,7 @@ from abc import abstractmethod, ABC
|
|||
import re
|
||||
|
||||
from searx import settings
|
||||
from searx.languages import language_codes
|
||||
from searx.sxng_locales import sxng_locales
|
||||
from searx.engines import categories, engines, engine_shortcuts
|
||||
from searx.external_bang import get_bang_definition_and_autocomplete
|
||||
from searx.search import EngineRef
|
||||
|
@ -84,7 +84,7 @@ class LanguageParser(QueryPartParser):
|
|||
found = False
|
||||
# check if any language-code is equal with
|
||||
# declared language-codes
|
||||
for lc in language_codes:
|
||||
for lc in sxng_locales:
|
||||
lang_id, lang_name, country, english_name, _flag = map(str.lower, lc)
|
||||
|
||||
# if correct language-code is found
|
||||
|
@ -125,7 +125,7 @@ class LanguageParser(QueryPartParser):
|
|||
self.raw_text_query.autocomplete_list.append(lang)
|
||||
return
|
||||
|
||||
for lc in language_codes:
|
||||
for lc in sxng_locales:
|
||||
if lc[0] not in settings['search']['languages']:
|
||||
continue
|
||||
lang_id, lang_name, country, english_name, _flag = map(str.lower, lc)
|
||||
|
|
|
@ -30,7 +30,10 @@ from .abstract import EngineProcessor
|
|||
|
||||
logger = logger.getChild('search.processors')
|
||||
PROCESSORS: Dict[str, EngineProcessor] = {}
|
||||
"""Cache request processores, stored by *engine-name* (:py:func:`initialize`)"""
|
||||
"""Cache request processores, stored by *engine-name* (:py:func:`initialize`)
|
||||
|
||||
:meta hide-value:
|
||||
"""
|
||||
|
||||
|
||||
def get_processor_class(engine_type):
|
||||
|
|
|
@ -138,7 +138,8 @@ class EngineProcessor(ABC):
|
|||
return False
|
||||
|
||||
def get_params(self, search_query, engine_category):
|
||||
"""Returns a set of *request params* or ``None`` if request is not supported.
|
||||
"""Returns a set of (see :ref:`request params <engine request arguments>`) or
|
||||
``None`` if request is not supported.
|
||||
|
||||
Not supported conditions (``None`` is returned):
|
||||
|
||||
|
@ -159,11 +160,20 @@ class EngineProcessor(ABC):
|
|||
params['safesearch'] = search_query.safesearch
|
||||
params['time_range'] = search_query.time_range
|
||||
params['engine_data'] = search_query.engine_data.get(self.engine_name, {})
|
||||
params['searxng_locale'] = search_query.lang
|
||||
|
||||
# deprecated / vintage --> use params['searxng_locale']
|
||||
#
|
||||
# Conditions related to engine's traits are implemented in engine.traits
|
||||
# module. Don't do 'locale' decisions here in the abstract layer of the
|
||||
# search processor, just pass the value from user's choice unchanged to
|
||||
# the engine request.
|
||||
|
||||
if hasattr(self.engine, 'language') and self.engine.language:
|
||||
params['language'] = self.engine.language
|
||||
else:
|
||||
params['language'] = search_query.lang
|
||||
|
||||
return params
|
||||
|
||||
@abstractmethod
|
||||
|
|
|
@ -51,6 +51,9 @@ class OnlineProcessor(EngineProcessor):
|
|||
super().initialize()
|
||||
|
||||
def get_params(self, search_query, engine_category):
|
||||
"""Returns a set of :ref:`request params <engine request online>` or ``None``
|
||||
if request is not supported.
|
||||
"""
|
||||
params = super().get_params(search_query, engine_category)
|
||||
if params is None:
|
||||
return None
|
||||
|
@ -184,11 +187,6 @@ class OnlineProcessor(EngineProcessor):
|
|||
self.handle_exception(result_container, e, suspend=True)
|
||||
self.logger.exception('CAPTCHA')
|
||||
except SearxEngineTooManyRequestsException as e:
|
||||
if "google" in self.engine_name:
|
||||
self.logger.warn(
|
||||
"Set to 'true' the use_mobile_ui parameter in the 'engines:'"
|
||||
" section of your settings.yml file if google is blocked for you."
|
||||
)
|
||||
self.handle_exception(result_container, e, suspend=True)
|
||||
self.logger.exception('Too many requests')
|
||||
except SearxEngineAccessDeniedException as e:
|
||||
|
@ -223,7 +221,7 @@ class OnlineProcessor(EngineProcessor):
|
|||
'test': ['unique_results'],
|
||||
}
|
||||
|
||||
if getattr(self.engine, 'supported_languages', []):
|
||||
if getattr(self.engine, 'traits', False):
|
||||
tests['lang_fr'] = {
|
||||
'matrix': {'query': 'paris', 'lang': 'fr'},
|
||||
'result_container': ['not_empty', ('has_language', 'fr')],
|
||||
|
|
|
@ -38,8 +38,8 @@ class OnlineCurrencyProcessor(OnlineProcessor):
|
|||
engine_type = 'online_currency'
|
||||
|
||||
def get_params(self, search_query, engine_category):
|
||||
"""Returns a set of *request params* or ``None`` if search query does not match
|
||||
to :py:obj:`parser_re`."""
|
||||
"""Returns a set of :ref:`request params <engine request online_currency>`
|
||||
or ``None`` if search query does not match to :py:obj:`parser_re`."""
|
||||
|
||||
params = super().get_params(search_query, engine_category)
|
||||
if params is None:
|
||||
|
|
|
@ -18,8 +18,9 @@ class OnlineDictionaryProcessor(OnlineProcessor):
|
|||
engine_type = 'online_dictionary'
|
||||
|
||||
def get_params(self, search_query, engine_category):
|
||||
"""Returns a set of *request params* or ``None`` if search query does not match
|
||||
to :py:obj:`parser_re`."""
|
||||
"""Returns a set of :ref:`request params <engine request online_dictionary>` or
|
||||
``None`` if search query does not match to :py:obj:`parser_re`.
|
||||
"""
|
||||
params = super().get_params(search_query, engine_category)
|
||||
if params is None:
|
||||
return None
|
||||
|
|
|
@ -20,9 +20,10 @@ class OnlineUrlSearchProcessor(OnlineProcessor):
|
|||
engine_type = 'online_url_search'
|
||||
|
||||
def get_params(self, search_query, engine_category):
|
||||
"""Returns a set of *request params* or ``None`` if search query does not match
|
||||
to at least one of :py:obj:`re_search_urls`.
|
||||
"""Returns a set of :ref:`request params <engine request online>` or ``None`` if
|
||||
search query does not match to :py:obj:`re_search_urls`.
|
||||
"""
|
||||
|
||||
params = super().get_params(search_query, engine_category)
|
||||
if params is None:
|
||||
return None
|
||||
|
|
|
@ -731,22 +731,9 @@ engines:
|
|||
- name: google
|
||||
engine: google
|
||||
shortcut: go
|
||||
# see https://docs.searxng.org/src/searx.engines.google.html#module-searx.engines.google
|
||||
use_mobile_ui: false
|
||||
# additional_tests:
|
||||
# android: *test_android
|
||||
|
||||
# - name: google italian
|
||||
# engine: google
|
||||
# shortcut: goit
|
||||
# use_mobile_ui: false
|
||||
# language: it
|
||||
|
||||
# - name: google mobile ui
|
||||
# engine: google
|
||||
# shortcut: gomui
|
||||
# use_mobile_ui: true
|
||||
|
||||
- name: google images
|
||||
engine: google_images
|
||||
shortcut: goi
|
||||
|
@ -1758,9 +1745,8 @@ engines:
|
|||
engine: peertube
|
||||
shortcut: ptb
|
||||
paging: true
|
||||
# https://instances.joinpeertube.org/instances
|
||||
base_url: https://peertube.biz/
|
||||
# base_url: https://tube.tardis.world/
|
||||
# alternatives see: https://instances.joinpeertube.org/instances
|
||||
# base_url: https://tube.4aem.com
|
||||
categories: videos
|
||||
disabled: true
|
||||
timeout: 6.0
|
||||
|
|
|
@ -12,13 +12,13 @@ import logging
|
|||
from base64 import b64decode
|
||||
from os.path import dirname, abspath
|
||||
|
||||
from searx.languages import language_codes as languages
|
||||
from .sxng_locales import sxng_locales
|
||||
|
||||
searx_dir = abspath(dirname(__file__))
|
||||
|
||||
logger = logging.getLogger('searx')
|
||||
OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss']
|
||||
LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages)
|
||||
SXNG_LOCALE_TAGS = ['all', 'auto'] + list(l[0] for l in sxng_locales)
|
||||
SIMPLE_STYLE = ('auto', 'light', 'dark')
|
||||
CATEGORIES_AS_TABS = {
|
||||
'general': {},
|
||||
|
@ -156,8 +156,8 @@ SCHEMA = {
|
|||
'safe_search': SettingsValue((0, 1, 2), 0),
|
||||
'autocomplete': SettingsValue(str, ''),
|
||||
'autocomplete_min': SettingsValue(int, 4),
|
||||
'default_lang': SettingsValue(tuple(LANGUAGE_CODES + ['']), ''),
|
||||
'languages': SettingSublistValue(LANGUAGE_CODES, LANGUAGE_CODES),
|
||||
'default_lang': SettingsValue(tuple(SXNG_LOCALE_TAGS + ['']), ''),
|
||||
'languages': SettingSublistValue(SXNG_LOCALE_TAGS, SXNG_LOCALE_TAGS),
|
||||
'ban_time_on_fail': SettingsValue(numbers.Real, 5),
|
||||
'max_ban_time_on_fail': SettingsValue(numbers.Real, 120),
|
||||
'suspended_times': {
|
||||
|
|
|
@ -1,73 +1,120 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# list of language codes
|
||||
# this file is generated automatically by utils/fetch_languages.py
|
||||
language_codes = (
|
||||
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
|
||||
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),
|
||||
('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'),
|
||||
'''List of SearXNG's locale codes.
|
||||
|
||||
This file is generated automatically by::
|
||||
|
||||
./manage pyenv.cmd searxng_extra/update/update_engine_traits.py
|
||||
'''
|
||||
|
||||
sxng_locales = (
|
||||
('ar', 'العربية', '', 'Arabic', '\U0001f310'),
|
||||
('bg', 'Български', '', 'Bulgarian', '\U0001f310'),
|
||||
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
|
||||
('ca', 'Català', '', 'Catalan', '\U0001f310'),
|
||||
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
|
||||
('cs', 'Čeština', '', 'Czech', '\U0001f310'),
|
||||
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
|
||||
('da', 'Dansk', '', 'Danish', '\U0001f310'),
|
||||
('da-DK', 'Dansk', 'Danmark', 'Danish', '\U0001f1e9\U0001f1f0'),
|
||||
('de', 'Deutsch', '', 'German', '\U0001f310'),
|
||||
('de-AT', 'Deutsch', 'Österreich', 'German', '\U0001f1e6\U0001f1f9'),
|
||||
('de-CH', 'Deutsch', 'Schweiz', 'German', '\U0001f1e8\U0001f1ed'),
|
||||
('de-DE', 'Deutsch', 'Deutschland', 'German', '\U0001f1e9\U0001f1ea'),
|
||||
('el', 'Ελληνικά', '', 'Greek', '\U0001f310'),
|
||||
('el-GR', 'Ελληνικά', 'Ελλάδα', 'Greek', '\U0001f1ec\U0001f1f7'),
|
||||
('en', 'English', '', 'English', '\U0001f310'),
|
||||
('en-AU', 'English', 'Australia', 'English', '\U0001f1e6\U0001f1fa'),
|
||||
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
|
||||
('en-GB', 'English', 'United Kingdom', 'English', '\U0001f1ec\U0001f1e7'),
|
||||
('en-IE', 'English', 'Ireland', 'English', '\U0001f1ee\U0001f1ea'),
|
||||
('en-IN', 'English', 'India', 'English', '\U0001f1ee\U0001f1f3'),
|
||||
('en-MY', 'English', 'Malaysia', 'English', '\U0001f1f2\U0001f1fe'),
|
||||
('en-NZ', 'English', 'New Zealand', 'English', '\U0001f1f3\U0001f1ff'),
|
||||
('en-PH', 'English', 'Philippines', 'English', '\U0001f1f5\U0001f1ed'),
|
||||
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
|
||||
('en-ZA', 'English', 'South Africa', 'English', '\U0001f1ff\U0001f1e6'),
|
||||
('es', 'Español', '', 'Spanish', '\U0001f310'),
|
||||
('es-AR', 'Español', 'Argentina', 'Spanish', '\U0001f1e6\U0001f1f7'),
|
||||
('es-CL', 'Español', 'Chile', 'Spanish', '\U0001f1e8\U0001f1f1'),
|
||||
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
|
||||
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
|
||||
('es-US', 'Español', 'Estados Unidos', 'Spanish', '\U0001f1fa\U0001f1f8'),
|
||||
('et', 'Eesti', '', 'Estonian', '\U0001f310'),
|
||||
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
|
||||
('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'),
|
||||
('fi', 'Suomi', '', 'Finnish', '\U0001f310'),
|
||||
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
|
||||
('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'),
|
||||
('fr', 'Français', '', 'French', '\U0001f310'),
|
||||
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
|
||||
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
|
||||
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
|
||||
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
|
||||
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
|
||||
('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'),
|
||||
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
|
||||
('he', 'עברית', '', 'Hebrew', '\U0001f1ee\U0001f1f7'),
|
||||
('hi', 'हिन्दी', '', 'Hindi', '\U0001f310'),
|
||||
('hr', 'Hrvatski', '', 'Croatian', '\U0001f310'),
|
||||
('hu', 'Magyar', '', 'Hungarian', '\U0001f310'),
|
||||
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
|
||||
('id', 'Indonesia', '', 'Indonesian', '\U0001f310'),
|
||||
('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'),
|
||||
('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'),
|
||||
('is', 'Íslenska', '', 'Icelandic', '\U0001f310'),
|
||||
('it', 'Italiano', '', 'Italian', '\U0001f310'),
|
||||
('it-CH', 'Italiano', 'Svizzera', 'Italian', '\U0001f1e8\U0001f1ed'),
|
||||
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
|
||||
('ja', '日本語', '', 'Japanese', '\U0001f310'),
|
||||
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
|
||||
('ko', '한국어', '', 'Korean', '\U0001f310'),
|
||||
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
|
||||
('lt-LT', 'Lietuvių', 'Lietuva', 'Lithuanian', '\U0001f1f1\U0001f1f9'),
|
||||
('lv-LV', 'Latviešu', 'Latvija', 'Latvian', '\U0001f1f1\U0001f1fb'),
|
||||
('lt', 'Lietuvių', '', 'Lithuanian', '\U0001f310'),
|
||||
('lv', 'Latviešu', '', 'Latvian', '\U0001f310'),
|
||||
('nb', 'Norsk Bokmål', '', 'Norwegian Bokmål', '\U0001f310'),
|
||||
('nb-NO', 'Norsk Bokmål', 'Norge', 'Norwegian Bokmål', '\U0001f1f3\U0001f1f4'),
|
||||
('nl', 'Nederlands', '', 'Dutch', '\U0001f310'),
|
||||
('nl-BE', 'Nederlands', 'België', 'Dutch', '\U0001f1e7\U0001f1ea'),
|
||||
('nl-NL', 'Nederlands', 'Nederland', 'Dutch', '\U0001f1f3\U0001f1f1'),
|
||||
('no-NO', 'Norsk', '', 'Norwegian (Bokmål)', '\U0001f1f3\U0001f1f4'),
|
||||
('pl', 'Polski', '', 'Polish', '\U0001f310'),
|
||||
('pl-PL', 'Polski', 'Polska', 'Polish', '\U0001f1f5\U0001f1f1'),
|
||||
('pt', 'Português', '', 'Portuguese', '\U0001f310'),
|
||||
('pt-BR', 'Português', 'Brasil', 'Portuguese', '\U0001f1e7\U0001f1f7'),
|
||||
('pt-PT', 'Português', 'Portugal', 'Portuguese', '\U0001f1f5\U0001f1f9'),
|
||||
('ro', 'Română', '', 'Romanian', '\U0001f310'),
|
||||
('ro-RO', 'Română', 'România', 'Romanian', '\U0001f1f7\U0001f1f4'),
|
||||
('ru', 'Русский', '', 'Russian', '\U0001f310'),
|
||||
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
|
||||
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'),
|
||||
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'),
|
||||
('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'),
|
||||
('sk', 'Slovenčina', '', 'Slovak', '\U0001f310'),
|
||||
('sl', 'Slovenščina', '', 'Slovenian', '\U0001f310'),
|
||||
('sr', 'Српски', '', 'Serbian', '\U0001f310'),
|
||||
('sv', 'Svenska', '', 'Swedish', '\U0001f310'),
|
||||
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
|
||||
('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'),
|
||||
('th', 'ไทย', '', 'Thai', '\U0001f310'),
|
||||
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
|
||||
('tr', 'Türkçe', '', 'Turkish', '\U0001f310'),
|
||||
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
|
||||
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'),
|
||||
('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'),
|
||||
('uk', 'Українська', '', 'Ukrainian', '\U0001f310'),
|
||||
('vi', 'Tiếng Việt', '', 'Vietnamese', '\U0001f310'),
|
||||
('zh', '中文', '', 'Chinese', '\U0001f310'),
|
||||
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
|
||||
('zh-HK', '中文', '中國香港', 'Chinese', '\U0001f1ed\U0001f1f0'),
|
||||
('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),
|
||||
('zh-TW', '中文', '台灣', 'Chinese', '\U0001f1f9\U0001f1fc'),
|
||||
)
|
||||
'''
|
||||
A list of five-element tuples:
|
||||
|
||||
0. SearXNG's internal locale tag (a language or region tag)
|
||||
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
|
||||
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
|
||||
Empty string for language tags.
|
||||
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
|
||||
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
|
||||
are represented by a globe (🌐)
|
||||
|
||||
.. code:: python
|
||||
|
||||
('en', 'English', '', 'English', '🌐'),
|
||||
('en-CA', 'English', 'Canada', 'English', '🇨🇦'),
|
||||
('en-US', 'English', 'United States', 'English', '🇺🇸'),
|
||||
..
|
||||
('fr', 'Français', '', 'French', '🌐'),
|
||||
('fr-BE', 'Français', 'Belgique', 'French', '🇧🇪'),
|
||||
('fr-CA', 'Français', 'Canada', 'French', '🇨🇦'),
|
||||
|
||||
:meta hide-value:
|
||||
'''
|
|
@ -1,12 +1,12 @@
|
|||
<select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}}
|
||||
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
|
||||
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }} [all]</option>
|
||||
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>
|
||||
{{- _('Auto-detect') -}}
|
||||
{%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%}
|
||||
</option>
|
||||
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
|
||||
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>
|
||||
{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}
|
||||
{%- for sxng_tag,lang_name,country_name,english_name,flag in sxng_locales | sort(attribute=1) -%}
|
||||
<option value="{{ sxng_tag }}" {% if sxng_tag == current_language %}selected="selected"{% endif %}>
|
||||
{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %} - {{ country_name }} {% endif %} [{{sxng_tag}}]
|
||||
</option>
|
||||
{%- endfor -%}
|
||||
</select>
|
||||
|
|
|
@ -115,10 +115,10 @@
|
|||
<legend id="pref_language">{{ _('Search language') }}</legend>
|
||||
<p class="value">{{- '' -}}
|
||||
<select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}}
|
||||
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
|
||||
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option>
|
||||
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
|
||||
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option>
|
||||
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }} [all]</option>
|
||||
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }} [auto]</option>
|
||||
{%- for sxng_tag,lang_name,country_name,english_name,flag in sxng_locales | sort(attribute=1) -%}
|
||||
<option value="{{ sxng_tag }}" {% if sxng_tag == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %} - {{ country_name }} {% endif %} [{{sxng_tag}}]</option>
|
||||
{%- endfor -%}
|
||||
</select>{{- '' -}}
|
||||
</p>
|
||||
|
|
|
@ -18,13 +18,11 @@ from urllib.parse import urljoin, urlparse
|
|||
|
||||
from lxml import html
|
||||
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
||||
from babel.core import get_global
|
||||
|
||||
|
||||
from searx import settings
|
||||
from searx.data import USER_AGENTS, data_dir
|
||||
from searx.version import VERSION_TAG
|
||||
from searx.languages import language_codes
|
||||
from searx.sxng_locales import sxng_locales
|
||||
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
|
||||
from searx import logger
|
||||
|
||||
|
@ -53,8 +51,8 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
|
|||
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
|
||||
"""fasttext model to predict laguage of a search term"""
|
||||
|
||||
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes])
|
||||
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
|
||||
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
|
||||
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
|
||||
|
||||
|
||||
class _NotSetClass: # pylint: disable=too-few-public-methods
|
||||
|
@ -355,102 +353,16 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
|
|||
is_abbr = len(lang) == 2
|
||||
lang = lang.lower()
|
||||
if is_abbr:
|
||||
for l in language_codes:
|
||||
for l in sxng_locales:
|
||||
if l[0][:2] == lang:
|
||||
return (True, l[0][:2], l[3].lower())
|
||||
return None
|
||||
for l in language_codes:
|
||||
for l in sxng_locales:
|
||||
if l[1].lower() == lang or l[3].lower() == lang:
|
||||
return (True, l[0][:2], l[3].lower())
|
||||
return None
|
||||
|
||||
|
||||
def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]:
|
||||
key = str(lang_list)
|
||||
value = _LANG_TO_LC_CACHE.get(key, None)
|
||||
if value is None:
|
||||
value = {}
|
||||
for lang in lang_list:
|
||||
value.setdefault(lang.split('-')[0], lang)
|
||||
_LANG_TO_LC_CACHE[key] = value
|
||||
return value
|
||||
|
||||
|
||||
# babel's get_global contains all sorts of miscellaneous locale and territory related data
|
||||
# see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py
|
||||
def _get_from_babel(lang_code: str, key):
|
||||
match = get_global(key).get(lang_code.replace('-', '_'))
|
||||
# for some keys, such as territory_aliases, match may be a list
|
||||
if isinstance(match, str):
|
||||
return match.replace('_', '-')
|
||||
return match
|
||||
|
||||
|
||||
def _match_language(lang_code: str, lang_list=[], custom_aliases={}) -> Optional[str]: # pylint: disable=W0102
|
||||
"""auxiliary function to match lang_code in lang_list"""
|
||||
# replace language code with a custom alias if necessary
|
||||
if lang_code in custom_aliases:
|
||||
lang_code = custom_aliases[lang_code]
|
||||
|
||||
if lang_code in lang_list:
|
||||
return lang_code
|
||||
|
||||
# try to get the most likely country for this language
|
||||
subtags = _get_from_babel(lang_code, 'likely_subtags')
|
||||
if subtags:
|
||||
if subtags in lang_list:
|
||||
return subtags
|
||||
subtag_parts = subtags.split('-')
|
||||
new_code = subtag_parts[0] + '-' + subtag_parts[-1]
|
||||
if new_code in custom_aliases:
|
||||
new_code = custom_aliases[new_code]
|
||||
if new_code in lang_list:
|
||||
return new_code
|
||||
|
||||
# try to get the any supported country for this language
|
||||
return _get_lang_to_lc_dict(lang_list).get(lang_code)
|
||||
|
||||
|
||||
def match_language( # pylint: disable=W0102
|
||||
locale_code, lang_list=[], custom_aliases={}, fallback: Optional[str] = 'en-US'
|
||||
) -> Optional[str]:
|
||||
"""get the language code from lang_list that best matches locale_code"""
|
||||
# try to get language from given locale_code
|
||||
language = _match_language(locale_code, lang_list, custom_aliases)
|
||||
if language:
|
||||
return language
|
||||
|
||||
locale_parts = locale_code.split('-')
|
||||
lang_code = locale_parts[0]
|
||||
|
||||
# if locale_code has script, try matching without it
|
||||
if len(locale_parts) > 2:
|
||||
language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases)
|
||||
if language:
|
||||
return language
|
||||
|
||||
# try to get language using an equivalent country code
|
||||
if len(locale_parts) > 1:
|
||||
country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases')
|
||||
if country_alias:
|
||||
language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
|
||||
if language:
|
||||
return language
|
||||
|
||||
# try to get language using an equivalent language code
|
||||
alias = _get_from_babel(lang_code, 'language_aliases')
|
||||
if alias:
|
||||
language = _match_language(alias, lang_list, custom_aliases)
|
||||
if language:
|
||||
return language
|
||||
|
||||
if lang_code != locale_code:
|
||||
# try to get language from given language without giving the country
|
||||
language = _match_language(lang_code, lang_list, custom_aliases)
|
||||
|
||||
return language or fallback
|
||||
|
||||
|
||||
def load_module(filename: str, module_dir: str) -> types.ModuleType:
|
||||
modname = splitext(filename)[0]
|
||||
modpath = join(module_dir, filename)
|
||||
|
|
|
@ -89,7 +89,6 @@ from searx.utils import (
|
|||
html_to_text,
|
||||
gen_useragent,
|
||||
dict_subset,
|
||||
match_language,
|
||||
)
|
||||
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
|
||||
from searx.query import RawTextQuery
|
||||
|
@ -117,12 +116,13 @@ from searx.locales import (
|
|||
RTL_LOCALES,
|
||||
localeselector,
|
||||
locales_initialize,
|
||||
match_locale,
|
||||
)
|
||||
|
||||
# renaming names from searx imports ...
|
||||
from searx.autocomplete import search_autocomplete, backends as autocomplete_backends
|
||||
from searx.languages import language_codes as languages
|
||||
from searx.redisdb import initialize as redis_initialize
|
||||
from searx.sxng_locales import sxng_locales
|
||||
from searx.search import SearchWithPlugins, initialize as search_initialize
|
||||
from searx.network import stream as http_stream, set_context_network_name
|
||||
from searx.search.checker import get_result as checker_get_result
|
||||
|
@ -227,7 +227,7 @@ def _get_browser_language(req, lang_list):
|
|||
if '-' in lang:
|
||||
lang_parts = lang.split('-')
|
||||
lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper())
|
||||
locale = match_language(lang, lang_list, fallback=None)
|
||||
locale = match_locale(lang, lang_list, fallback=None)
|
||||
if locale is not None:
|
||||
return locale
|
||||
return 'en'
|
||||
|
@ -407,7 +407,7 @@ def get_client_settings():
|
|||
|
||||
|
||||
def render(template_name: str, **kwargs):
|
||||
|
||||
# pylint: disable=too-many-statements
|
||||
kwargs['client_settings'] = str(
|
||||
base64.b64encode(
|
||||
bytes(
|
||||
|
@ -438,17 +438,20 @@ def render(template_name: str, **kwargs):
|
|||
kwargs['OTHER_CATEGORY'] = OTHER_CATEGORY
|
||||
|
||||
# i18n
|
||||
kwargs['language_codes'] = [l for l in languages if l[0] in settings['search']['languages']]
|
||||
kwargs['sxng_locales'] = [l for l in sxng_locales if l[0] in settings['search']['languages']]
|
||||
|
||||
locale = request.preferences.get_value('locale')
|
||||
kwargs['locale_rfc5646'] = _get_locale_rfc5646(locale)
|
||||
|
||||
if locale in RTL_LOCALES and 'rtl' not in kwargs:
|
||||
kwargs['rtl'] = True
|
||||
|
||||
if 'current_language' not in kwargs:
|
||||
kwargs['current_language'] = match_language(
|
||||
request.preferences.get_value('language'), settings['search']['languages']
|
||||
)
|
||||
_locale = request.preferences.get_value('language')
|
||||
if _locale in ('auto', 'all'):
|
||||
kwargs['current_language'] = _locale
|
||||
else:
|
||||
kwargs['current_language'] = match_locale(_locale, settings['search']['languages'])
|
||||
|
||||
# values from settings
|
||||
kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html']
|
||||
|
@ -810,6 +813,13 @@ def search():
|
|||
)
|
||||
)
|
||||
|
||||
if search_query.lang in ('auto', 'all'):
|
||||
current_language = search_query.lang
|
||||
else:
|
||||
current_language = match_locale(
|
||||
search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language")
|
||||
)
|
||||
|
||||
# search_query.lang contains the user choice (all, auto, en, ...)
|
||||
# when the user choice is "auto", search.search_query.lang contains the detected language
|
||||
# otherwise it is equal to search_query.lang
|
||||
|
@ -832,12 +842,8 @@ def search():
|
|||
result_container.unresponsive_engines
|
||||
),
|
||||
current_locale = request.preferences.get_value("locale"),
|
||||
current_language = match_language(
|
||||
search_query.lang,
|
||||
settings['search']['languages'],
|
||||
fallback=request.preferences.get_value("language")
|
||||
),
|
||||
search_language = match_language(
|
||||
current_language = current_language,
|
||||
search_language = match_locale(
|
||||
search.search_query.lang,
|
||||
settings['search']['languages'],
|
||||
fallback=request.preferences.get_value("language")
|
||||
|
@ -907,16 +913,11 @@ def autocompleter():
|
|||
# and there is a query part
|
||||
if len(raw_text_query.autocomplete_list) == 0 and len(sug_prefix) > 0:
|
||||
|
||||
# get language from cookie
|
||||
language = request.preferences.get_value('language')
|
||||
if not language or language == 'all':
|
||||
language = 'en'
|
||||
else:
|
||||
language = language.split('-')[0]
|
||||
# get SearXNG's locale and autocomplete backend from cookie
|
||||
sxng_locale = request.preferences.get_value('language')
|
||||
backend_name = request.preferences.get_value('autocomplete')
|
||||
|
||||
# run autocompletion
|
||||
raw_results = search_autocomplete(request.preferences.get_value('autocomplete'), sug_prefix, language)
|
||||
for result in raw_results:
|
||||
for result in search_autocomplete(backend_name, sug_prefix, sxng_locale):
|
||||
# attention: this loop will change raw_text_query object and this is
|
||||
# the reason why the sug_prefix was stored before (see above)
|
||||
if result != sug_prefix:
|
||||
|
@ -1001,7 +1002,9 @@ def preferences():
|
|||
'rate80': rate80,
|
||||
'rate95': rate95,
|
||||
'warn_timeout': e.timeout > settings['outgoing']['request_timeout'],
|
||||
'supports_selected_language': _is_selected_language_supported(e, request.preferences),
|
||||
'supports_selected_language': e.traits.is_locale_supported(
|
||||
str(request.preferences.get_value('language') or 'all')
|
||||
),
|
||||
'result_count': result_count,
|
||||
}
|
||||
# end of stats
|
||||
|
@ -1052,7 +1055,9 @@ def preferences():
|
|||
# supports
|
||||
supports = {}
|
||||
for _, e in filtered_engines.items():
|
||||
supports_selected_language = _is_selected_language_supported(e, request.preferences)
|
||||
supports_selected_language = e.traits.is_locale_supported(
|
||||
str(request.preferences.get_value('language') or 'all')
|
||||
)
|
||||
safesearch = e.safesearch
|
||||
time_range_support = e.time_range_support
|
||||
for checker_test_name in checker_results.get(e.name, {}).get('errors', {}):
|
||||
|
@ -1099,16 +1104,6 @@ def preferences():
|
|||
)
|
||||
|
||||
|
||||
def _is_selected_language_supported(engine, preferences: Preferences): # pylint: disable=redefined-outer-name
|
||||
language = preferences.get_value('language')
|
||||
if language == 'all':
|
||||
return True
|
||||
x = match_language(
|
||||
language, getattr(engine, 'supported_languages', []), getattr(engine, 'language_aliases', {}), None
|
||||
)
|
||||
return bool(x)
|
||||
|
||||
|
||||
@app.route('/image_proxy', methods=['GET'])
|
||||
def image_proxy():
|
||||
# pylint: disable=too-many-return-statements, too-many-branches
|
||||
|
@ -1327,10 +1322,7 @@ def config():
|
|||
if not request.preferences.validate_token(engine):
|
||||
continue
|
||||
|
||||
supported_languages = engine.supported_languages
|
||||
if isinstance(engine.supported_languages, dict):
|
||||
supported_languages = list(engine.supported_languages.keys())
|
||||
|
||||
_languages = engine.traits.languages.keys()
|
||||
_engines.append(
|
||||
{
|
||||
'name': name,
|
||||
|
@ -1339,7 +1331,8 @@ def config():
|
|||
'enabled': not engine.disabled,
|
||||
'paging': engine.paging,
|
||||
'language_support': engine.language_support,
|
||||
'supported_languages': supported_languages,
|
||||
'languages': list(_languages),
|
||||
'regions': list(engine.traits.regions.keys()),
|
||||
'safesearch': engine.safesearch,
|
||||
'time_range_support': engine.time_range_support,
|
||||
'timeout': engine.timeout,
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import csv
|
||||
|
@ -8,7 +10,7 @@ import re
|
|||
import inspect
|
||||
import itertools
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Iterable, List, Tuple, Dict
|
||||
from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING
|
||||
|
||||
from io import StringIO
|
||||
from codecs import getincrementalencoder
|
||||
|
@ -16,7 +18,10 @@ from codecs import getincrementalencoder
|
|||
from flask_babel import gettext, format_date
|
||||
|
||||
from searx import logger, settings
|
||||
from searx.engines import Engine, OTHER_CATEGORY
|
||||
from searx.engines import OTHER_CATEGORY
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from searx.enginelib import Engine
|
||||
|
||||
|
||||
VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$')
|
||||
|
|
|
@ -18,8 +18,8 @@ from os.path import join
|
|||
from lxml.html import fromstring
|
||||
|
||||
from searx.engines import wikidata, set_loggers
|
||||
from searx.utils import extract_text, match_language
|
||||
from searx.locales import LOCALE_NAMES, locales_initialize
|
||||
from searx.utils import extract_text
|
||||
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
|
||||
from searx import searx_dir
|
||||
from searx.utils import gen_useragent, detect_language
|
||||
import searx.search
|
||||
|
@ -225,9 +225,9 @@ def fetch_website_description(engine_name, website):
|
|||
fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
|
||||
if fetched_lang is None or desc is None:
|
||||
continue
|
||||
matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None)
|
||||
matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None)
|
||||
if matched_lang is None:
|
||||
fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
|
||||
fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
|
||||
matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
|
||||
if matched_lang is not None:
|
||||
update_description(engine_name, matched_lang, desc, website, replace=False)
|
||||
|
|
198
searxng_extra/update/update_engine_traits.py
Executable file
198
searxng_extra/update/update_engine_traits.py
Executable file
|
@ -0,0 +1,198 @@
|
|||
#!/usr/bin/env python
|
||||
# lint: pylint
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/languages.py`
|
||||
|
||||
:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
|
||||
Persistence of engines traits, fetched from the engines.
|
||||
|
||||
:origin:`searx/sxng_locales.py`
|
||||
Is generated from intersecting each engine's supported traits.
|
||||
|
||||
The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
|
||||
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
from unicodedata import lookup
|
||||
from pathlib import Path
|
||||
from pprint import pformat
|
||||
import babel
|
||||
|
||||
from searx import settings, searx_dir
|
||||
from searx import network
|
||||
from searx.engines import load_engines
|
||||
from searx.enginelib.traits import EngineTraitsMap
|
||||
|
||||
# Output files.
|
||||
languages_file = Path(searx_dir) / 'sxng_locales.py'
|
||||
languages_file_header = """\
|
||||
# -*- coding: utf-8 -*-
|
||||
'''List of SearXNG's locale codes.
|
||||
|
||||
This file is generated automatically by::
|
||||
|
||||
./manage pyenv.cmd searxng_extra/update/update_engine_traits.py
|
||||
'''
|
||||
|
||||
sxng_locales = (
|
||||
"""
|
||||
languages_file_footer = """,
|
||||
)
|
||||
'''
|
||||
A list of five-digit tuples:
|
||||
|
||||
0. SearXNG's internal locale tag (a language or region tag)
|
||||
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
|
||||
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
|
||||
Empty string for language tags.
|
||||
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
|
||||
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
|
||||
are represented by a globe (\U0001F310)
|
||||
|
||||
.. code:: python
|
||||
|
||||
('en', 'English', '', 'English', '\U0001f310'),
|
||||
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
|
||||
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
|
||||
..
|
||||
('fr', 'Français', '', 'French', '\U0001f310'),
|
||||
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
|
||||
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
|
||||
|
||||
:meta hide-value:
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
lang2emoji = {
|
||||
'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger
|
||||
'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina
|
||||
'jp': '\U0001F1EF\U0001F1F5', # Japanese
|
||||
'ua': '\U0001F1FA\U0001F1E6', # Ukrainian
|
||||
'he': '\U0001F1EE\U0001F1F7', # Hebrew
|
||||
}
|
||||
|
||||
|
||||
def main():
    """Regenerate the engine traits and SearXNG's locale list.

    Loads the engines from ``settings['engines']``, fetches (and persists)
    their traits, filters the locale tags by the support thresholds and
    writes the resulting list to the languages file.
    """
    load_engines(settings['engines'])
    # NOTE(review): the original keeps ``EngineTraitsMap.from_data()`` as a
    # commented-out alternative to fetching the traits.
    write_languages_file(filter_locales(fetch_traits_map()))
|
||||
|
||||
|
||||
def fetch_traits_map():
    """Fetch the traits of each engine, persist them and return the map.

    The result of :py:obj:`EngineTraitsMap.fetch_traits` is saved with
    ``save_data()`` (target: ``ENGINE_TRAITS_FILE``) before it is returned.
    """
    network.set_timeout_for_thread(10.0)

    fetched = EngineTraitsMap.fetch_traits(log=lambda msg: print(msg))  # pylint: disable=unnecessary-lambda
    print("fetched properties from %s engines" % len(fetched))
    print("write json file: %s" % fetched.ENGINE_TRAITS_FILE)
    fetched.save_data()
    return fetched
|
||||
|
||||
|
||||
def filter_locales(traits_map: 'EngineTraitsMap', min_eng_per_region: int = 11, min_eng_per_lang: int = 13):
    """Filter language & region tags by a threshold.

    :param traits_map: mapping whose values expose ``regions`` and
        ``languages`` (mappings keyed by SearXNG region / language tag).
    :param min_eng_per_region: minimum number of engines that must support a
        region tag for it to be kept.
    :param min_eng_per_lang: minimum number of engines that must support a
        language tag for it to be kept.
    :return: set of tags: the kept regions, the language part of each kept
        region and the kept languages.
    """
    # function-scope import keeps the block self-contained
    from collections import Counter  # pylint: disable=import-outside-toplevel

    region_count = Counter(reg for eng in traits_map.values() for reg in eng.regions)
    regions = {tag for tag, count in region_count.items() if count >= min_eng_per_region}

    # a kept region (e.g. 'fr-BE') always implies its language ('fr')
    lang_from_region = {tag.split('-')[0] for tag in regions}

    # Ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they are
    # already counted by the existence of 'zh' or 'sr', 'pa').
    lang_count = Counter(lang for eng in traits_map.values() for lang in eng.languages if '_' not in lang)
    languages = {tag for tag, count in lang_count.items() if count >= min_eng_per_lang}

    return regions | lang_from_region | languages
|
||||
|
||||
|
||||
def write_languages_file(sxng_tag_list):
    """Write the Python module that defines ``sxng_locales``.

    For each tag (in sorted order) a five-item tuple is built: the tag, the
    language name, the territory name (or ``''``), the English language name
    and the unicode flag.  The tuples are pretty-printed between
    ``languages_file_header`` and ``languages_file_footer`` and written to
    ``languages_file``.

    :param sxng_tag_list: iterable of SearXNG locale tags (e.g. ``'fr'``,
        ``'fr-BE'``), parsed with ``babel.Locale.parse(tag, sep='-')``.
    """
    language_codes = []

    for sxng_tag in sorted(sxng_tag_list):
        sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
        flag = get_unicode_flag(sxng_locale) or ''

        language_codes.append(
            (
                sxng_tag,
                sxng_locale.get_language_name().title(),
                sxng_locale.get_territory_name() or '',
                sxng_locale.english_name.split(' (')[0],
                # UnicodeEscape: keep the flag as an escape sequence in the output
                UnicodeEscape(flag),
            )
        )

    # Build the content before opening the file, so a formatting error does
    # not leave a truncated output file behind.
    file_content = "{header} {language_codes}{footer}".format(
        header=languages_file_header,
        # [1:-1] strips the enclosing parentheses of the tuple's repr; header
        # and footer supply the opening and closing parenthesis
        language_codes=pformat(tuple(language_codes), width=120, indent=4)[1:-1],
        footer=languages_file_footer,
    )

    # the context manager closes the file, no explicit close() needed
    with open(languages_file, 'w', encoding='utf-8') as new_file:
        new_file.write(file_content)
|
||||
|
||||
|
||||
class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`"""

    def __repr__(self):
        # 'unicode-escape' yields pure ASCII (non-ASCII chars become \x.., \u....
        # or \U........ escapes, backslashes are doubled).
        escaped = self.encode('unicode-escape').decode('ascii')
        # Single quotes are not escaped by the codec; escape them so the
        # result is always a valid single-quoted Python string literal.
        return "'" + escaped.replace("'", "\\'") + "'"
|
||||
|
||||
|
||||
def get_unicode_flag(locale: babel.Locale):
    """Determine a unicode flag (emoji) that fits to the ``locale``

    Lookup order: an entry in ``lang2emoji`` for the language, the globe for
    locales without territory, an entry in ``lang2emoji`` for the lower-cased
    territory, and finally the pair of regional indicator symbols built from
    the first two characters of the territory.  Returns ``None`` (after
    printing an error) when no regional indicator symbol exists.
    """
    flag = lang2emoji.get(locale.language)

    if not flag and not locale.territory:
        # plain language tag without a dedicated flag --> globe
        return '\U0001F310'

    if not flag:
        flag = lang2emoji.get(locale.territory.lower())

    if flag:
        return flag

    prefix = 'REGIONAL INDICATOR SYMBOL LETTER '
    try:
        first = lookup(prefix + locale.territory[0])
        second = lookup(prefix + locale.territory[1])
    except KeyError as exc:
        print("ERROR: %s --> %s" % (locale, exc))
        return None

    return first + second
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,313 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# lint: pylint
|
||||
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""This script generates languages.py from intersecting each engine's supported
|
||||
languages.
|
||||
|
||||
Output files: :origin:`searx/data/engines_languages.json` and
|
||||
:origin:`searx/languages.py` (:origin:`CI Update data ...
|
||||
<.github/workflows/data-update.yml>`).
|
||||
|
||||
"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
from unicodedata import lookup
|
||||
import json
|
||||
from pathlib import Path
|
||||
from pprint import pformat
|
||||
from babel import Locale, UnknownLocaleError
|
||||
from babel.languages import get_global
|
||||
from babel.core import parse_locale
|
||||
|
||||
from searx import settings, searx_dir
|
||||
from searx.engines import load_engines, engines
|
||||
from searx.network import set_timeout_for_thread
|
||||
|
||||
# Output files.
|
||||
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
|
||||
languages_file = Path(searx_dir) / 'languages.py'
|
||||
|
||||
|
||||
# Fetches supported languages for each engine and writes json file with those.
|
||||
def fetch_supported_languages():
|
||||
set_timeout_for_thread(10.0)
|
||||
|
||||
engines_languages = {}
|
||||
names = list(engines)
|
||||
names.sort()
|
||||
|
||||
for engine_name in names:
|
||||
if hasattr(engines[engine_name], 'fetch_supported_languages'):
|
||||
engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
|
||||
print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name))
|
||||
if type(engines_languages[engine_name]) == list: # pylint: disable=unidiomatic-typecheck
|
||||
engines_languages[engine_name] = sorted(engines_languages[engine_name])
|
||||
|
||||
print("fetched languages from %s engines" % len(engines_languages))
|
||||
|
||||
# write json file
|
||||
with open(engines_languages_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(engines_languages, f, indent=2, sort_keys=True)
|
||||
|
||||
return engines_languages
|
||||
|
||||
|
||||
# Get babel Locale object from lang_code if possible.
|
||||
def get_locale(lang_code):
|
||||
try:
|
||||
locale = Locale.parse(lang_code, sep='-')
|
||||
return locale
|
||||
except (UnknownLocaleError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
lang2emoji = {
|
||||
'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger
|
||||
'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina
|
||||
'jp': '\U0001F1EF\U0001F1F5', # Japanese
|
||||
'ua': '\U0001F1FA\U0001F1E6', # Ukrainian
|
||||
'he': '\U0001F1EE\U0001F1F7', # Hebrew
|
||||
}
|
||||
|
||||
|
||||
def get_unicode_flag(lang_code):
|
||||
"""Determine a unicode flag (emoji) that fits to the ``lang_code``"""
|
||||
|
||||
emoji = lang2emoji.get(lang_code.lower())
|
||||
if emoji:
|
||||
return emoji
|
||||
|
||||
if len(lang_code) == 2:
|
||||
return '\U0001F310'
|
||||
|
||||
language = territory = script = variant = ''
|
||||
try:
|
||||
language, territory, script, variant = parse_locale(lang_code, '-')
|
||||
except ValueError as exc:
|
||||
print(exc)
|
||||
|
||||
# https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
|
||||
if not territory:
|
||||
# https://www.unicode.org/emoji/charts/emoji-list.html#country-flag
|
||||
emoji = lang2emoji.get(language)
|
||||
if not emoji:
|
||||
print(
|
||||
"%s --> language: %s / territory: %s / script: %s / variant: %s"
|
||||
% (lang_code, language, territory, script, variant)
|
||||
)
|
||||
return emoji
|
||||
|
||||
emoji = lang2emoji.get(territory.lower())
|
||||
if emoji:
|
||||
return emoji
|
||||
|
||||
try:
|
||||
c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[0])
|
||||
c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[1])
|
||||
# print("%s --> territory: %s --> %s%s" %(lang_code, territory, c1, c2 ))
|
||||
except KeyError as exc:
|
||||
print("%s --> territory: %s --> %s" % (lang_code, territory, exc))
|
||||
return None
|
||||
|
||||
return c1 + c2
|
||||
|
||||
|
||||
def get_territory_name(lang_code):
|
||||
country_name = None
|
||||
locale = get_locale(lang_code)
|
||||
try:
|
||||
if locale is not None:
|
||||
country_name = locale.get_territory_name()
|
||||
except FileNotFoundError as exc:
|
||||
print("ERROR: %s --> %s" % (locale, exc))
|
||||
return country_name
|
||||
|
||||
|
||||
# Join all language lists.
|
||||
def join_language_lists(engines_languages):
|
||||
language_list = {}
|
||||
for engine_name in engines_languages:
|
||||
for lang_code in engines_languages[engine_name]:
|
||||
|
||||
# apply custom fixes if necessary
|
||||
if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
|
||||
lang_code = next(
|
||||
lc for lc, alias in engines[engine_name].language_aliases.items() if lang_code == alias
|
||||
)
|
||||
|
||||
locale = get_locale(lang_code)
|
||||
|
||||
# ensure that lang_code uses standard language and country codes
|
||||
if locale and locale.territory:
|
||||
lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory)
|
||||
short_code = lang_code.split('-')[0]
|
||||
|
||||
# add language without country if not in list
|
||||
if short_code not in language_list:
|
||||
if locale:
|
||||
# get language's data from babel's Locale object
|
||||
language_name = locale.get_language_name().title()
|
||||
english_name = locale.english_name.split(' (')[0]
|
||||
elif short_code in engines_languages['wikipedia']:
|
||||
# get language's data from wikipedia if not known by babel
|
||||
language_name = engines_languages['wikipedia'][short_code]['name']
|
||||
english_name = engines_languages['wikipedia'][short_code]['english_name']
|
||||
else:
|
||||
language_name = None
|
||||
english_name = None
|
||||
|
||||
# add language to list
|
||||
language_list[short_code] = {
|
||||
'name': language_name,
|
||||
'english_name': english_name,
|
||||
'counter': set(),
|
||||
'countries': {},
|
||||
}
|
||||
|
||||
# add language with country if not in list
|
||||
if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
|
||||
country_name = ''
|
||||
if locale:
|
||||
# get country name from babel's Locale object
|
||||
try:
|
||||
country_name = locale.get_territory_name()
|
||||
except FileNotFoundError as exc:
|
||||
print("ERROR: %s --> %s" % (locale, exc))
|
||||
locale = None
|
||||
|
||||
language_list[short_code]['countries'][lang_code] = {
|
||||
'country_name': country_name,
|
||||
'counter': set(),
|
||||
}
|
||||
|
||||
# count engine for both language_country combination and language alone
|
||||
language_list[short_code]['counter'].add(engine_name)
|
||||
if lang_code != short_code:
|
||||
language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)
|
||||
|
||||
return language_list
|
||||
|
||||
|
||||
def filter_language_list(all_languages):
    """Reduce the language list to the most supported languages and countries.

    ``all_languages`` maps a language code to a dict with the keys ``name``,
    ``english_name``, ``counter`` (set of engine names supporting the
    language) and ``countries`` (region tag --> dict with ``country_name`` and
    its own ``counter`` set).

    A language is kept when at least ``min_engines_per_lang`` engines support
    it, or when it is supported by every enabled engine of the ``general``
    category that declares ``supported_languages``.  For each kept language the
    region tags supported by at least ``min_engines_per_country`` engines are
    kept as well.

    Returns a dict mapping a locale tag (language or language-region) to a dict
    with ``name``, ``english_name`` and, for regions with a known name,
    ``country_name``.
    """
    min_engines_per_lang = 12
    min_engines_per_country = 7

    main_engines = [
        engine_name
        for engine_name, engine in engines.items()
        if 'general' in engine.categories and engine.supported_languages and not engine.disabled
    ]

    # filter list to include only languages supported by most engines or all default general engines
    filtered_languages = {
        code: lang
        for code, lang in all_languages.items()
        if (
            len(lang['counter']) >= min_engines_per_lang
            or all(main_engine in lang['counter'] for main_engine in main_engines)
        )
    }

    def _copy_lang_data(lang, country_name=None):
        # copy only the display fields, the engine counters are not part of the result
        new_dict = {
            'name': all_languages[lang]['name'],
            'english_name': all_languages[lang]['english_name'],
        }
        if country_name:
            new_dict['country_name'] = country_name
        return new_dict

    # for each language get country codes supported by most engines or at least one country code
    filtered_languages_with_countries = {}
    for lang, lang_data in filtered_languages.items():

        # get language's country codes with enough supported engines
        filtered_countries = {
            lang_country: _copy_lang_data(lang, country_data['country_name'])
            for lang_country, country_data in lang_data['countries'].items()
            if len(country_data['counter']) >= min_engines_per_country
        }

        if len(filtered_countries) > 1:
            # add language without countries too if there's more than one country to choose from
            filtered_countries[lang] = _copy_lang_data(lang, None)

        elif not filtered_countries:
            # if no country has enough engines try to get most likely country code from babel
            lang_country = None
            subtags = get_global('likely_subtags').get(lang)
            if subtags:
                country_code = subtags.split('_')[-1]
                # only a two-character last subtag is accepted as a country code
                if len(country_code) == 2:
                    lang_country = "{lang}-{country}".format(lang=lang, country=country_code)

            # fall back to the bare language tag when no country could be derived
            filtered_countries[lang_country or lang] = _copy_lang_data(lang, None)

        filtered_languages_with_countries.update(filtered_countries)

    return filtered_languages_with_countries
|
||||
|
||||
|
||||
class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`

    ``repr()`` of an instance is a single-quoted, ASCII-only Python string
    literal: non-ASCII characters and backslashes are escaped by the
    ``unicode-escape`` codec and single quotes are backslash-escaped, so the
    literal evaluates back to the original string.
    """

    def __repr__(self):
        # the unicode-escape codec emits pure ASCII bytes, but it leaves quotes untouched
        escaped = self.encode('unicode-escape').decode('ascii')
        return "'" + escaped.replace("'", "\\'") + "'"
|
||||
|
||||
|
||||
def write_languages_file(languages):
    """Write the ``language_codes`` tuple to ``languages_file``.

    ``languages`` maps a locale tag to a dict with at least the key ``name``
    and optionally ``english_name``.  Entries whose ``name`` is ``None`` are
    reported on stdout and skipped.  Each written item is a tuple of

    - the locale tag,
    - the name, cut at the first ``' ('``,
    - the result of ``get_territory_name(code)`` (or ``''``),
    - the english name (or ``''``) and
    - the result of ``get_unicode_flag(code)`` (or ``''``) wrapped in
      :py:obj:`UnicodeEscape`.
    """
    file_headers = (
        "# -*- coding: utf-8 -*-",
        "# list of language codes",
        "# this file is generated automatically by utils/fetch_languages.py",
        "language_codes = (\n",
    )

    language_codes = []

    for code in sorted(languages):

        name = languages[code]['name']
        if name is None:
            print("ERROR: languages['%s'] --> %s" % (code, languages[code]))
            continue

        flag = get_unicode_flag(code) or ''
        item = (
            code,
            name.split(' (')[0],
            get_territory_name(code) or '',
            languages[code].get('english_name') or '',
            UnicodeEscape(flag),
        )

        language_codes.append(item)

    language_codes = tuple(language_codes)

    # Render the whole content first: opening the file truncates it, so a
    # formatting error must not be able to leave an empty file behind.
    file_content = "{file_headers} {language_codes},\n)\n".format(
        # fmt: off
        file_headers = '\n'.join(file_headers),
        # [1:-1] strips the parentheses of the tuple, they are part of the template
        language_codes = pformat(language_codes, indent=4)[1:-1]
        # fmt: on
    )

    # the context manager closes the file, no explicit close() needed
    with open(languages_file, 'w', encoding='utf-8') as new_file:
        new_file.write(file_content)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Pipeline: load the engines, fetch the languages of each engine, merge
    # them into one list, keep the most supported ones and write the result.
    load_engines(settings['engines'])
    _filtered_languages = filter_language_list(
        join_language_lists(fetch_supported_languages())
    )
    write_languages_file(_filtered_languages)
|
|
@ -50,7 +50,7 @@ from pathlib import Path
|
|||
from searx import searx_dir
|
||||
from searx.network import set_timeout_for_thread
|
||||
from searx.engines import wikidata, set_loggers
|
||||
from searx.languages import language_codes
|
||||
from searx.sxng_locales import sxng_locales
|
||||
from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK
|
||||
|
||||
set_loggers(wikidata, 'wikidata')
|
||||
|
@ -76,7 +76,7 @@ GROUP BY ?key ?item ?itemLabel
|
|||
ORDER BY ?key ?item ?itemLabel
|
||||
"""
|
||||
|
||||
LANGUAGES = [l[0].lower() for l in language_codes]
|
||||
LANGUAGES = [l[0].lower() for l in sxng_locales]
|
||||
|
||||
PRESET_KEYS = {
|
||||
('wikidata',): {'en': 'Wikidata'},
|
||||
|
|
111
tests/unit/test_locales.py
Normal file
111
tests/unit/test_locales.py
Normal file
|
@ -0,0 +1,111 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Test some code from module :py:obj:`searx.locales`"""
|
||||
|
||||
from searx import locales
|
||||
from searx.sxng_locales import sxng_locales
|
||||
from tests import SearxTestCase
|
||||
|
||||
|
||||
class TestLocales(SearxTestCase):
    """Implemented tests:

    - :py:obj:`searx.locales.match_locale`
    """

    def test_match_locale(self):
        """Check ``match_locale`` against SearXNG's locale list and custom tag lists."""

        locale_tag_list = [x[0] for x in sxng_locales]

        # Test SearXNG search languages

        self.assertEqual(locales.match_locale('de', locale_tag_list), 'de')
        self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr')
        self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh')

        # Test SearXNG search regions

        self.assertEqual(locales.match_locale('ca-es', locale_tag_list), 'ca-ES')
        self.assertEqual(locales.match_locale('de-at', locale_tag_list), 'de-AT')
        self.assertEqual(locales.match_locale('de-de', locale_tag_list), 'de-DE')
        self.assertEqual(locales.match_locale('en-UK', locale_tag_list), 'en-GB')
        self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE')
        self.assertEqual(locales.match_locale('fr-ca', locale_tag_list), 'fr-CA')
        self.assertEqual(locales.match_locale('fr-ch', locale_tag_list), 'fr-CH')
        self.assertEqual(locales.match_locale('zh-cn', locale_tag_list), 'zh-CN')
        self.assertEqual(locales.match_locale('zh-tw', locale_tag_list), 'zh-TW')
        self.assertEqual(locales.match_locale('zh-hk', locale_tag_list), 'zh-HK')

        # Test language script code

        self.assertEqual(locales.match_locale('zh-hans', locale_tag_list), 'zh-CN')
        self.assertEqual(locales.match_locale('zh-hans-cn', locale_tag_list), 'zh-CN')
        self.assertEqual(locales.match_locale('zh-hant', locale_tag_list), 'zh-TW')
        self.assertEqual(locales.match_locale('zh-hant-tw', locale_tag_list), 'zh-TW')

        # Test individual locale lists

        self.assertEqual(locales.match_locale('es', [], fallback='fallback'), 'fallback')

        self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE')
        self.assertEqual(locales.match_locale('es', ['ES']), 'ES')
        self.assertEqual(locales.match_locale('es', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
        self.assertEqual(locales.match_locale('es-AR', ['es-AR', 'es-ES', 'es-MX']), 'es-AR')
        self.assertEqual(locales.match_locale('es-CO', ['es-AR', 'es-ES']), 'es-ES')
        self.assertEqual(locales.match_locale('es-CO', ['es-AR']), 'es-AR')

        # Tests from the commit message of 9ae409a05a

        # Assumption:
        # A. When a user selects a language the results should be optimized according to
        #    the selected language.
        #
        # B. When user selects a language and a territory the results should be
        #    optimized with first priority on territory and second on language.

        # Assume we have an engine that supports the following locales:
        locale_tag_list = ['zh-CN', 'zh-HK', 'nl-BE', 'fr-CA']

        # Examples (Assumption A.)
        # ------------------------

        # A user selects region 'zh-TW' which should end in zh_HK.
        # hint: CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')
        self.assertEqual(locales.match_locale('zh-TW', locale_tag_list), 'zh-HK')

        # A user selects only the language 'zh' which should end in CN
        self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh-CN')

        # A user selects only the language 'fr' which should end in fr_CA
        self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-CA')

        # The difference in priority on the territory is best shown with an
        # engine that supports the following locales:
        locale_tag_list = ['fr-FR', 'fr-CA', 'en-GB', 'nl-BE']

        # A user selects only a language
        self.assertEqual(locales.match_locale('en', locale_tag_list), 'en-GB')

        # hint: the engine supports fr_FR and fr_CA; since no territory is given,
        # fr_FR takes priority ..
        self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-FR')

        # Examples (Assumption B.)
        # ------------------------

        # A user selects region 'fr-BE' which should end in nl-BE
        self.assertEqual(locales.match_locale('fr-BE', locale_tag_list), 'nl-BE')

        # If the user selects a language and there are two locales like the
        # following:

        locale_tag_list = ['fr-BE', 'fr-CH']

        # The locale is selected by looking at the "population percent" and
        # this percentage is higher in BE (68%) compared to CH (21%)

        self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-BE')
|
|
@ -87,39 +87,6 @@ class TestUtils(SearxTestCase):
|
|||
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
||||
self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
|
||||
|
||||
def test_match_language(self):
    """Run ``utils.match_language`` against a table of expected results."""
    aliases = {'en-GB': 'en-UK', 'he': 'iw'}

    # each row: (positional arguments, keyword arguments, expected result)
    expectations = [
        (('es', ['es']), {}, 'es'),
        (('es', []), {'fallback': 'fallback'}, 'fallback'),
        (('ja', ['jp'], {'ja': 'jp'}), {}, 'jp'),
        # handle script tags
        (('zh-CN', ['zh-Hans-CN', 'zh-Hant-TW']), {}, 'zh-Hans-CN'),
        (('zh-TW', ['zh-Hans-CN', 'zh-Hant-TW']), {}, 'zh-Hant-TW'),
        (('zh-Hans-CN', ['zh-CN', 'zh-TW']), {}, 'zh-CN'),
        (('zh-Hant-TW', ['zh-CN', 'zh-TW']), {}, 'zh-TW'),
        (('zh-Hans', ['zh-CN', 'zh-TW', 'zh-HK']), {}, 'zh-CN'),
        (('zh-Hant', ['zh-CN', 'zh-TW', 'zh-HK']), {}, 'zh-TW'),
        # guess country
        (('de-DE', ['de']), {}, 'de'),
        (('de', ['de-DE']), {}, 'de-DE'),
        (('es-CO', ['es-AR', 'es-ES', 'es-MX']), {}, 'es-ES'),
        (('es-CO', ['es-MX']), {}, 'es-MX'),
        (('en-UK', ['en-AU', 'en-GB', 'en-US']), {}, 'en-GB'),
        (('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), {}, 'en-UK'),
        # language aliases
        (('iw', ['he']), {}, 'he'),
        (('he', ['iw'], aliases), {}, 'iw'),
        (('iw-IL', ['he']), {}, 'he'),
        (('he-IL', ['iw'], aliases), {}, 'iw'),
        (('iw', ['he-IL']), {}, 'he-IL'),
        (('he', ['iw-IL'], aliases), {}, 'iw-IL'),
        (('iw-IL', ['he-IL']), {}, 'he-IL'),
        (('he-IL', ['iw-IL'], aliases), {}, 'iw-IL'),
    ]

    for args, kwargs, expected in expectations:
        self.assertEqual(utils.match_language(*args, **kwargs), expected)
|
||||
|
||||
def test_ecma_unscape(self):
|
||||
self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
|
||||
self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
|
||||
|
|
|
@ -52,9 +52,6 @@ enabled_plugins:
|
|||
|
||||
engines:
|
||||
|
||||
- name: google
|
||||
use_mobile_ui: true
|
||||
|
||||
# - name: fdroid
|
||||
# disabled: false
|
||||
#
|
||||
|
|
Loading…
Reference in a new issue