Merge pull request #333 from dalf/enh-engine-descriptions

RFC: /preferences: display engine descriptions
Alexandre Flament 2021-09-25 11:29:25 +02:00 committed by GitHub
commit b046322c7b
31 changed files with 4111 additions and 57 deletions


@@ -14,6 +14,7 @@ __all__ = [
     'WIKIDATA_UNITS',
     'EXTERNAL_BANGS',
     'OSM_KEYS_TAGS',
+    'ENGINE_DESCRIPTIONS',
     'ahmia_blacklist_loader',
 ]
@@ -45,3 +46,4 @@ EXTERNAL_URLS = _load('external_urls.json')
 WIKIDATA_UNITS = _load('wikidata_units.json')
 EXTERNAL_BANGS = _load('external_bangs.json')
 OSM_KEYS_TAGS = _load('osm_keys_tags.json')
+ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
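Once the generated file ships, the descriptions load like any other bundled data set. A minimal usage sketch (the locale key and the value shapes follow get_output() in the update script at the end of this diff; the actual engine names depend on the generated file):

    from searx.data import ENGINE_DESCRIPTIONS

    # first level: locale, second level: engine name
    for engine_name, description in ENGINE_DESCRIPTIONS.get('en', {}).items():
        # a value is a bare string (implicit "wikipedia" source),
        # a [description, source] pair, or an ["engine:lang", "ref"] alias
        print(engine_name, description)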

File diff suppressed because it is too large


@@ -12,6 +12,16 @@ from dateutil.parser import parse as dateparse
 from lxml import html
 from searx.utils import extract_text
 
+# about
+about = {
+    "website": 'https://bandcamp.com/',
+    "wikidata_id": 'Q545966',
+    "official_api_documentation": 'https://bandcamp.com/developer',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'HTML',
+}
+
 categories = ['music']
 paging = True


@@ -9,9 +9,9 @@ from searx.utils import searx_useragent
 # about
 about = {
-    "website": 'https://photon.komoot.de',
+    "website": 'https://photon.komoot.io',
     "wikidata_id": None,
-    "official_api_documentation": 'https://photon.komoot.de/',
+    "official_api_documentation": 'https://photon.komoot.io/',
     "use_official_api": True,
     "require_api_key": False,
     "results": 'JSON',


@@ -1618,7 +1618,7 @@ engines:
     categories: general
     about:
       website: https://brave.com/search/
-      wikidata_id: Q22906900
+      wikidata_id: Q107355971
       use_official_api: false
       require_api_key: false
       results: HTML


@@ -1309,6 +1309,7 @@ input.cursor-text {
   font-size: 14px;
   font-weight: normal;
   z-index: 1000000;
+  max-width: 40rem;
 }
 td:hover .engine-tooltip,
 th:hover .engine-tooltip,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1336,6 +1336,7 @@ input.cursor-text {
   font-size: 14px;
   font-weight: normal;
   z-index: 1000000;
+  max-width: 40rem;
 }
 td:hover .engine-tooltip,
 th:hover .engine-tooltip,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -299,6 +299,29 @@ $(document).ready(function(){
         $( this ).off( event );
     });
 });
+;$(document).ready(function(){
+    let engine_descriptions = null;
+    function load_engine_descriptions() {
+        if (engine_descriptions == null) {
+            $.ajax("engine_descriptions.json", dataType="json").done(function(data) {
+                engine_descriptions = data;
+                for (const [engine_name, description] of Object.entries(data)) {
+                    let elements = $('[data-engine-name="' + engine_name + '"] .description');
+                    for(const element of elements) {
+                        let source = ' (<i>' + searx.translations['Source'] + ':&nbsp;' + description[1] + '</i>)';
+                        element.innerHTML = description[0] + source;
+                    }
+                }
+            });
+        }
+    }
+
+    if (document.querySelector('body[class="preferences_endpoint"]')) {
+        $('[data-engine-name]').hover(function() {
+            load_engine_descriptions();
+        });
+    }
+});
 ;$(document).ready(function(){
     $("#allow-all-engines").click(function() {
         $(".onoffswitch-checkbox").each(function() { this.checked = false;});

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,23 @@
+$(document).ready(function(){
+    let engine_descriptions = null;
+    function load_engine_descriptions() {
+        if (engine_descriptions == null) {
+            $.ajax("engine_descriptions.json", dataType="json").done(function(data) {
+                engine_descriptions = data;
+                for (const [engine_name, description] of Object.entries(data)) {
+                    let elements = $('[data-engine-name="' + engine_name + '"] .description');
+                    for(const element of elements) {
+                        let source = ' (<i>' + searx.translations['Source'] + ':&nbsp;' + description[1] + '</i>)';
+                        element.innerHTML = description[0] + source;
+                    }
+                }
+            });
+        }
+    }
+
+    if (document.querySelector('body[class="preferences_endpoint"]')) {
+        $('[data-engine-name]').hover(function() {
+            load_engine_descriptions();
+        });
+    }
+});
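The script indexes description[0] and description[1] without checks because the /engine_descriptions.json route (added in webapp.py further down) always responds with normalized [description, source] pairs. A hypothetical payload, with invented texts, would look like:

    {
        "bandcamp": ["Bandcamp is an online record store and music community.", "wikipedia"],
        "photon": ["Photon is an open source geocoder built for OpenStreetMap data.", "https://photon.komoot.io"]
    }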


@@ -25,6 +25,7 @@ input.cursor-text {
   font-size: 14px;
   font-weight: normal;
   z-index: 1000000;
+  max-width: 40rem;
 }
 td:hover .engine-tooltip, th:hover .engine-tooltip, .engine-tooltip:hover {


@@ -1929,6 +1929,14 @@ td:hover .engine-tooltip,
 #main_preferences div.selectable_url pre {
   width: 100%;
 }
+#main_preferences th.name .engine-tooltip {
+  margin-top: 1.8rem;
+  left: 20rem;
+  max-width: 40rem;
+}
+#main_preferences th.name .engine-tooltip .engine-description {
+  margin-top: 0.5rem;
+}
 @media screen and (max-width: 75em) {
   .preferences_back {
     clear: both;

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1929,6 +1929,14 @@ td:hover .engine-tooltip,
 #main_preferences div.selectable_url pre {
   width: 100%;
 }
+#main_preferences th.name .engine-tooltip {
+  margin-top: 1.8rem;
+  left: 20rem;
+  max-width: 40rem;
+}
+#main_preferences th.name .engine-tooltip .engine-description {
+  margin-top: 0.5rem;
+}
 @media screen and (max-width: 75em) {
   .preferences_back {
     clear: both;

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -621,6 +621,33 @@ searx.ready(function() {
     });
   });
 })(window, document, window.searx);
+;(function (w, d, searx) {
+  'use strict';
+
+  searx.ready(function() {
+    let engine_descriptions = null;
+    function load_engine_descriptions() {
+      if (engine_descriptions == null) {
+        searx.http("GET", "engine_descriptions.json").then(function(content) {
+          engine_descriptions = JSON.parse(content);
+          for (const [engine_name, description] of Object.entries(engine_descriptions)) {
+            let elements = d.querySelectorAll('[data-engine-name="' + engine_name + '"] .engine-description');
+            for(const element of elements) {
+              let source = ' (<i>' + searx.translations['Source'] + ':&nbsp;' + description[1] + '</i>)';
+              element.innerHTML = description[0] + source;
+            }
+          }
+        });
+      }
+    }
+
+    if (d.querySelector('body[class="preferences_endpoint"]')) {
+      for(const el of d.querySelectorAll('[data-engine-name]')) {
+        searx.on(el, 'mouseenter', load_engine_descriptions);
+      }
+    }
+  });
+})(window, document, window.searx);
 ;/**
  * searx is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Affero General Public License as published by

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,27 @@
+(function (w, d, searx) {
+  'use strict';
+
+  searx.ready(function() {
+    let engine_descriptions = null;
+    function load_engine_descriptions() {
+      if (engine_descriptions == null) {
+        searx.http("GET", "engine_descriptions.json").then(function(content) {
+          engine_descriptions = JSON.parse(content);
+          for (const [engine_name, description] of Object.entries(engine_descriptions)) {
+            let elements = d.querySelectorAll('[data-engine-name="' + engine_name + '"] .engine-description');
+            for(const element of elements) {
+              let source = ' (<i>' + searx.translations['Source'] + ':&nbsp;' + description[1] + '</i>)';
+              element.innerHTML = description[0] + source;
+            }
+          }
+        });
+      }
+    }
+
+    if (d.querySelector('body[class="preferences_endpoint"]')) {
+      for(const el of d.querySelectorAll('[data-engine-name]')) {
+        searx.on(el, 'mouseenter', load_engine_descriptions);
+      }
+    }
+  });
+})(window, document, window.searx);


@@ -108,6 +108,18 @@
       width: 100%;
     }
   }
+
+  th.name {
+    .engine-tooltip {
+      margin-top: 1.8rem;
+      left: 20rem;
+      max-width: 40rem;
+
+      .engine-description {
+        margin-top: 0.5rem;
+      }
+    }
+  }
 }
 
 @media screen and (max-width: 75em) {


@@ -4,6 +4,7 @@
 {% if search_engine.about is defined or stats[search_engine.name]['result_count'] > 0 %}
 {% set about = search_engine.about %}
 <div class="engine-tooltip" role="tooltip" id="{{ id }}">{{- "" -}}
+    <p class="description"></p>
     {% if search_engine.about is defined %}
     <h5><a href="{{about.website}}" rel="noreferrer">{{about.website}}</a></h5>
     {%- if about.wikidata_id -%}<p><a href="https://www.wikidata.org/wiki/{{about.wikidata_id}}" rel="noreferrer">wikidata.org/wiki/{{about.wikidata_id}}</a></p>{%- endif -%}
@@ -343,7 +344,7 @@
 <td class="onoff-checkbox">
 {{- checkbox_toggle('engine_' + search_engine.name|replace(' ', '_') + '__' + categ|replace(' ', '_'), (search_engine.name, categ) in disabled_engines) -}}
 </td>
-<th scope="row"><span aria-labelledby="{{ 'tooltip_' + categ + '_' + search_engine.name }}">
+<th scope="row" data-engine-name="{{ search_engine.name }}"><span aria-labelledby="{{ 'tooltip_' + categ + '_' + search_engine.name }}">
 {%- if search_engine.enable_http %}{{ icon('exclamation-sign', 'No HTTPS') }}{% endif -%}
 {{- search_engine.name -}}</span>
 {{- engine_about(search_engine, 'tooltip_' + categ + '_' + search_engine.name) -}}
@@ -363,7 +364,7 @@
 <td>{{ support_toggle(supports[search_engine.name]['safesearch']) }}</td>
 <td>{{ support_toggle(supports[search_engine.name]['supports_selected_language']) }}</td>
 <td>{{ shortcuts[search_engine.name] }}</td>
-<th scope="row"><span>{% if search_engine.enable_http %}{{ icon('exclamation-sign', 'No HTTPS') }}{% endif %}{{ search_engine.name }}</span>{{ engine_about(search_engine) }}</th>
+<th scope="row" data-engine-name="{{ search_engine.name }}"><span>{% if search_engine.enable_http %}{{ icon('exclamation-sign', 'No HTTPS') }}{% endif %}{{ search_engine.name }}</span>{{ engine_about(search_engine) }}</th>
 <td class="onoff-checkbox">
 {{ checkbox_toggle('engine_' + search_engine.name|replace(' ', '_') + '__' + categ|replace(' ', '_'), (search_engine.name, categ) in disabled_engines) }}
 </td>


@@ -22,6 +22,7 @@
 {% if search_engine.about is defined %}
 {% set about = search_engine.about %}
 <div class="engine-tooltip" role="tooltip">{{- "" -}}
+    <p class="engine-description"></p>
     <p><a href="{{about.website}}" rel="noreferrer">{{about.website}}</a></p>
 {%- if about.wikidata_id -%}<p><a href="https://www.wikidata.org/wiki/{{about.wikidata_id}}" rel="noreferrer">wikidata.org/wiki/{{about.wikidata_id}}</a></p>{%- endif -%}
 {%- if search_engine.enable_http %}<p>{{ icon('exclamation-sign', 'No HTTPS') }}{{ _('No HTTPS')}}</p>{% endif -%}
@@ -262,7 +263,7 @@
 {% set engine_id = 'engine_' + search_engine.name|replace(' ', '_') + '__' + categ|replace(' ', '_') %}
 <tr>
 <td class="engine_checkbox">{{ checkbox_onoff(engine_id, (search_engine.name, categ) in disabled_engines) }}</td>
-<th class="name">{% if search_engine.enable_http %}{{ icon('warning', 'No HTTPS') }}{% endif %} {{ search_engine.name }} {{ engine_about(search_engine) }}</th>
+<th class="name" data-engine-name="{{ search_engine.name }}">{% if search_engine.enable_http %}{{ icon('warning', 'No HTTPS') }}{% endif %} {{ search_engine.name }} {{ engine_about(search_engine) }}</th>
 <td class="shortcut">{{ shortcuts[search_engine.name] }}</td>
 <td>{{ checkbox(engine_id + '_supported_languages', supports[search_engine.name]['supports_selected_language'], true, true) }}</td>
 <td>{{ checkbox(engine_id + '_safesearch', supports[search_engine.name]['safesearch'], true, true) }}</td>


@@ -54,6 +54,7 @@ from searx import (
     settings,
     searx_debug,
 )
+from searx.data import ENGINE_DESCRIPTIONS
 from searx.settings_defaults import OUTPUT_FORMATS
 from searx.settings_loader import get_default_settings_path
 from searx.exceptions import SearxParameterException
@@ -393,7 +394,9 @@ def image_proxify(url):
 def get_translations():
     return {
         # when there is autocompletion
-        'no_item_found': gettext('No item found')
+        'no_item_found': gettext('No item found'),
+        # /preferences: the source of the engine description (wikipedia, wikidata, website)
+        'Source': gettext('Source'),
     }
@@ -1140,6 +1143,23 @@ def image_proxy():
         return '', 400
 
+@app.route('/engine_descriptions.json', methods=['GET'])
+def engine_descriptions():
+    locale = get_locale().split('_')[0]
+    result = ENGINE_DESCRIPTIONS['en'].copy()
+    if locale != 'en':
+        for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
+            result[engine] = description
+    for engine, description in result.items():
+        if len(description) == 2 and description[1] == 'ref':
+            ref_engine, ref_lang = description[0].split(':')
+            description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
+        if isinstance(description, str):
+            description = [description, 'wikipedia']
+        result[engine] = description
+    return jsonify(result)
 @app.route('/stats', methods=['GET'])
 def stats():
     """Render engine statistics page."""


@@ -1,15 +1,19 @@
 #!/usr/bin/env python
 
 import sys
 import json
-from urllib.parse import quote, urlparse
-import detect_language
+from urllib.parse import urlparse
+from os.path import join
 from lxml.html import fromstring
+from langdetect import detect_langs
+from langdetect.lang_detect_exception import LangDetectException
 
 from searx.engines import wikidata, set_loggers
-from searx.utils import extract_text
+from searx.utils import extract_text, match_language
+from searx.locales import LOCALE_NAMES
 import searx
+from searx import searx_dir
+from searx.utils import gen_useragent
 import searx.search
 import searx.network
@@ -18,6 +22,7 @@ set_loggers(wikidata, 'wikidata')
 SPARQL_WIKIPEDIA_ARTICLE = """
 SELECT DISTINCT ?item ?name
 WHERE {
+  hint:Query hint:optimizer "None".
   VALUES ?item { %IDS% }
   ?article schema:about ?item ;
       schema:inLanguage ?lang ;
@@ -38,8 +43,23 @@ WHERE {
 ORDER BY ?itemLang
 """
 
+NOT_A_DESCRIPTION = [
+    'web site',
+    'site web',
+    'komputa serĉilo',
+    'interreta serĉilo',
+    'bilaketa motor',
+    'web search engine',
+    'wikimedia täpsustuslehekülg',
+]
+
+SKIP_ENGINE_SOURCE = [
+    ('gitlab', 'wikidata')  # descriptions are about wikipedia disambiguation pages
+]
+
 LANGUAGES = LOCALE_NAMES.keys()
-LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
+WIKIPEDIA_LANGUAGES = {'language': 'wikipedia_language'}
+LANGUAGES_SPARQL = ''
 IDS = None
 
 descriptions = {}
@@ -54,15 +74,30 @@ def normalize_description(description):
 def update_description(engine_name, lang, description, source, replace=True):
     if not isinstance(description, str):
         return
+    description = normalize_description(description)
+    if description.lower() == engine_name.lower():
+        return
+    if description.lower() in NOT_A_DESCRIPTION:
+        return
+    if (engine_name, source) in SKIP_ENGINE_SOURCE:
+        return
+    if ' ' not in description:
+        # skip unique word description (like "website")
+        return
     if replace or lang not in descriptions[engine_name]:
-        descriptions[engine_name][lang] = [normalize_description(description), source]
+        descriptions[engine_name][lang] = [description, source]
-def get_wikipedia_summary(language, pageid):
-    search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
-    url = search_url.format(title=quote(pageid), language=language)
+def get_wikipedia_summary(lang, pageid):
+    params = {
+        'language': lang.replace('_','-'),
+        'headers': {}
+    }
+    searx.engines.engines['wikipedia'].request(pageid, params)
     try:
-        response = searx.network.get(url)
+        response = searx.network.get(params['url'], headers=params['headers'], timeout=10)
         response.raise_for_status()
         api_result = json.loads(response.text)
         return api_result.get('extract')
@@ -71,15 +106,19 @@ def get_wikipedia_summary(language, pageid):
 def detect_language(text):
-    r = cld3.get_language(str(text))  # pylint: disable=E1101
-    if r is not None and r.probability >= 0.98 and r.is_reliable:
-        return r.language
+    try:
+        r = detect_langs(str(text))  # pylint: disable=E1101
+    except LangDetectException:
+        return None
+    if len(r) > 0 and r[0].prob > 0.95:
+        return r[0].lang
     return None
 
 def get_website_description(url, lang1, lang2=None):
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+        'User-Agent': gen_useragent(),
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'DNT': '1',
         'Upgrade-Insecure-Requests': '1',
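For context on the cld3-to-langdetect switch above: detect_langs() raises an exception instead of returning None on undecidable input, and reports its candidates sorted by probability, hence the try/except and the lower 0.95 threshold. A standalone sketch, assuming only that the langdetect package is installed:

    from langdetect import detect_langs
    from langdetect.lang_detect_exception import LangDetectException

    try:
        candidates = detect_langs('Photon is an open source geocoder built for OpenStreetMap data')
    except LangDetectException:
        candidates = []
    # candidates looks like [en:0.99999...]; keep only confident guesses
    if candidates and candidates[0].prob > 0.95:
        print(candidates[0].lang)  # 'en'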
@@ -117,8 +156,15 @@ def get_website_description(url, lang1, lang2=None):
 def initialize():
-    global descriptions, wd_to_engine_name, IDS
+    global IDS, WIKIPEDIA_LANGUAGES, LANGUAGES_SPARQL
     searx.search.initialize()
+    wikipedia_engine = searx.engines.engines['wikipedia']
+    WIKIPEDIA_LANGUAGES = {
+        language: wikipedia_engine.url_lang(language.replace('_', '-'))
+        for language in LANGUAGES
+    }
+    WIKIPEDIA_LANGUAGES['nb_NO'] = 'no'
+    LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values()))
     for engine_name, engine in searx.engines.engines.items():
         descriptions[engine_name] = {}
         wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
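An indicative sketch of what initialize() computes; the exact mapping depends on the wikipedia engine's url_lang() and on the configured locales, so these values are assumptions:

    # after initialize() (hypothetical values):
    WIKIPEDIA_LANGUAGES = {
        'en': 'en',
        'fr': 'fr',
        'nb_NO': 'no',  # forced explicitly above
    }
    LANGUAGES_SPARQL = "'en', 'fr', 'no'"  # deduplicated; order not guaranteed (built from a set)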
@@ -129,7 +175,7 @@ def initialize():
 def fetch_wikidata_descriptions():
-    global IDS
     searx.network.set_timeout_for_thread(60)
     result = wikidata.send_wikidata_query(
         SPARQL_DESCRIPTION
         .replace('%IDS%', IDS)
@@ -138,15 +184,15 @@ def fetch_wikidata_descriptions():
     if result is not None:
         for binding in result['results']['bindings']:
             wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
-            lang = binding['itemDescription']['xml:lang']
+            wikidata_lang = binding['itemDescription']['xml:lang']
             description = binding['itemDescription']['value']
-            if ' ' in description:  # skip unique word description (like "website")
             for engine_name in wd_to_engine_name[wikidata_id]:
+                for lang in LANGUAGES:
+                    if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang:
                         update_description(engine_name, lang, description, 'wikidata')
 
 def fetch_wikipedia_descriptions():
-    global IDS
     result = wikidata.send_wikidata_query(
         SPARQL_WIKIPEDIA_ARTICLE
         .replace('%IDS%', IDS)
@@ -155,11 +201,12 @@ def fetch_wikipedia_descriptions():
     if result is not None:
         for binding in result['results']['bindings']:
             wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
-            lang = binding['name']['xml:lang']
+            wikidata_lang = binding['name']['xml:lang']
             pageid = binding['name']['value']
-            description = get_wikipedia_summary(lang, pageid)
-            if description is not None and ' ' in description:
             for engine_name in wd_to_engine_name[wikidata_id]:
+                for lang in LANGUAGES:
+                    if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang:
+                        description = get_wikipedia_summary(lang, pageid)
                         update_description(engine_name, lang, description, 'wikipedia')
@@ -173,36 +220,96 @@ def normalize_url(url):
 def fetch_website_description(engine_name, website):
     default_lang, default_description = get_website_description(website, None, None)
     if default_lang is None or default_description is None:
+        # the front page can't be fetched: skip this engine
         return
-    if default_lang not in descriptions[engine_name]:
-        descriptions[engine_name][default_lang] = [normalize_description(default_description), website]
-    for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'):
-        if request_lang.split('-')[0] not in descriptions[engine_name]:
-            lang, desc = get_website_description(website, request_lang, request_lang.split('-')[0])
-            if desc is not None and desc != default_description:
-                update_description(engine_name, lang, desc, website, replace=False)
-            else:
+    wikipedia_languages_r = { V: K for K, V in WIKIPEDIA_LANGUAGES.items() }
+    languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr']
+    languages = languages + [l for l in LANGUAGES if l not in languages]
+
+    previous_matched_lang = None
+    previous_count = 0
+    for lang in languages:
+        if lang not in descriptions[engine_name]:
+            fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
+            if fetched_lang is None or desc is None:
+                continue
+            matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None)
+            if matched_lang is None:
+                fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
+                matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
+            if matched_lang is not None:
+                update_description(engine_name, matched_lang, desc, website, replace=False)
+                # check if desc changed with the different lang values
+                if matched_lang == previous_matched_lang:
+                    previous_count += 1
+                    if previous_count == 6:
+                        # the website has returned the same description for 6 different languages in Accept-Language header
+                        # stop now
+                        break
+                else:
+                    previous_matched_lang = matched_lang
+                    previous_count = 0
 def fetch_website_descriptions():
     for engine_name, engine in searx.engines.engines.items():
         website = getattr(engine, "about", {}).get('website')
-        if website is None:
+        if website is None and hasattr(engine, "search_url"):
             website = normalize_url(getattr(engine, "search_url"))
-        if website is None:
+        if website is None and hasattr(engine, "base_url"):
             website = normalize_url(getattr(engine, "base_url"))
         if website is not None:
             fetch_website_description(engine_name, website)
+def get_engine_descriptions_filename():
+    return join(join(searx_dir, "data"), "engine_descriptions.json")
+
+
+def get_output():
+    """
+    From descriptions[engine][language] = [description, source]
+    To
+
+    * output[language][engine] = description_and_source
+    * description_and_source can be:
+       * [description, source]
+       * description (if source = "wikipedia")
+       * [f"engine:lang", "ref"] (reference to another existing description)
+    """
+    output = {
+        locale: {} for locale in LOCALE_NAMES
+    }
+
+    seen_descriptions = {}
+    for engine_name, lang_descriptions in descriptions.items():
+        for language, description in lang_descriptions.items():
+            if description[0] in seen_descriptions:
+                ref = seen_descriptions[description[0]]
+                description = [f'{ref[0]}:{ref[1]}', 'ref']
+            else:
+                seen_descriptions[description[0]] = (engine_name, language)
+            if description[1] == 'wikipedia':
+                description = description[0]
+            output.setdefault(language, {}).setdefault(engine_name, description)
+
+    return output
 def main():
     initialize()
+    print('Fetching wikidata descriptions')
     fetch_wikidata_descriptions()
+    print('Fetching wikipedia descriptions')
     fetch_wikipedia_descriptions()
+    print('Fetching website descriptions')
     fetch_website_descriptions()
-    sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False))
+
+    output = get_output()
+    with open(get_engine_descriptions_filename(), 'w', encoding='utf8') as f:
+        f.write(json.dumps(output, indent=1, separators=(',', ':'), ensure_ascii=False))
 
 if __name__ == "__main__":
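To make the get_output() docstring concrete, here is a hypothetical slice of the generated engine_descriptions.json (engine names and texts invented for illustration): "ddg" keeps a bare string because its source is wikipedia, "ddg lite" shares the same text and therefore becomes a reference, and "photon" keeps its [description, source] pair because the source is a website.

    {
     "en": {
      "ddg": "DuckDuckGo is an internet search engine.",
      "ddg lite": ["ddg:en", "ref"],
      "photon": ["Photon is an open source geocoder.", "https://photon.komoot.io"]
     },
     "fr": {
      "ddg": "DuckDuckGo est un moteur de recherche."
     }
    }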