(feat) add jisho.org

Closes #1016
This commit is contained in:
Austin Huang 2022-03-31 14:45:39 -04:00
parent d1334beb4f
commit 934ae4e086
No known key found for this signature in database
GPG key ID: 84C23AA04587A91F
2 changed files with 131 additions and 0 deletions

125
searx/engines/jisho.py Normal file
View file

@ -0,0 +1,125 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Jisho (the Japanese-English dictionary)
"""
import html
import json
from urllib.parse import urlencode, urljoin
# Engine metadata: where the data comes from and how it is fetched.
about = dict(
    website='https://jisho.org',
    wikidata_id='Q24568389',
    official_api_documentation="https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api",
    use_official_api=True,
    require_api_key=False,
    results='JSON',
)

# Engine configuration.
categories = ['dictionaries']
paging = False

# Site root, prefix of the per-word pages, and the search endpoint template
# (``{query}`` is replaced by an already URL-encoded query string).
URL = 'https://jisho.org'
BASE_URL = URL + '/word/'
SEARCH_URL = f'{URL}/api/v1/search/words?{{query}}'
def request(query, params):
    """Set ``params['url']`` to the Jisho word-search URL for *query*.

    The query is URL-encoded as the ``keyword`` parameter and substituted
    into ``SEARCH_URL``. The same ``params`` mapping is returned after
    being updated in place.
    """
    encoded_keyword = urlencode({'keyword': query})
    search_url = SEARCH_URL.format(query=encoded_keyword)
    params['url'] = search_url
    # NOTE(review): ``logger`` is not defined in the visible part of this
    # module; presumably it is provided by the engine loader -- confirm.
    logger.debug(f"query_url --> {params['url']}")
    return params
# Attribution header that opens the infobox definition list.
_INFOBOX_HEADER = '''
<small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a>
and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a>
by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small><ul>
'''


def _is_wikipedia_only(page):
    """Return True when the first sense of *page* is a Wikipedia definition."""
    senses = page.get('senses') or []
    if not senses:
        return False
    parts_of_speech = senses[0].get('parts_of_speech') or []
    # A sense may carry no part of speech at all; such entries are kept.
    return bool(parts_of_speech) and parts_of_speech[0] == 'Wikipedia definition'


def _alt_forms(page):
    """Return every written form of *page* as ``word (reading)`` strings."""
    forms = []
    for form in page.get('japanese') or []:
        word = form.get('word')
        reading = form.get('reading')
        if word is None:
            # Kana-only entry: the reading is the only thing to show.
            if reading is not None:
                forms.append(reading)
        elif reading is None:
            forms.append(word)
        else:
            forms.append(f"{word} ({reading})")
    return forms


def _sense_extra(sense):
    """Return the free-text notes (tags, info, restrictions) of one sense.

    These fields are not documented by Jisho; this implementation is based
    solely on the original author's assumptions about their meaning.
    """
    tags = sense.get('tags') or []
    info = sense.get('info') or []
    restrictions = sense.get('restrictions') or []

    notes = []
    if tags:
        if info:
            # e.g. "usually written as kana: <kana>"
            notes.append(f"{tags[0]}, {info[0]}.")
        else:
            # abbreviation, archaism, etc.
            notes.append(', '.join(tags) + '.')
    elif info:
        # Capitalisation of this field is inconsistent upstream.
        notes.append(', '.join(info).capitalize() + '.')
    if restrictions:
        notes.append('Only applies to: ' + ', '.join(restrictions) + '.')
    return ' '.join(notes)


def _definitions(page, with_extra):
    """Return ``(parts_of_speech, english_definitions, extra)`` per sense.

    ``extra`` is only computed when *with_extra* is true (it is shown in the
    infobox only); otherwise it is the empty string.
    """
    return [
        (
            ', '.join(sense.get('parts_of_speech') or []),
            '; '.join(sense.get('english_definitions') or []),
            _sense_extra(sense) if with_extra else '',
        )
        for sense in page.get('senses') or []
    ]


def _infobox_content(other_forms, definitions):
    """Build the HTML body of the infobox.

    All text coming from the API is HTML-escaped because the result is
    assembled as a raw HTML string.
    """
    parts = []
    if other_forms:
        escaped_forms = ', '.join(html.escape(form, quote=False) for form in other_forms)
        parts.append(f'<p><i>Other forms:</i> {escaped_forms}</p>')

    parts.append(_INFOBOX_HEADER)
    for pos, engdef, extra in definitions:
        if pos == 'Wikipedia definition':
            # Wikipedia senses get their own attribution and list.
            parts.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
        safe_def = html.escape(engdef, quote=False)
        if pos == '':
            item = f"<li>{safe_def}"
        else:
            item = f"<li><i>{html.escape(pos, quote=False)}</i>: {safe_def}"
        if extra != '':
            item += f" ({html.escape(extra, quote=False)})"
        parts.append(item + '</li>')
    parts.append('</ul>')
    return ''.join(parts)


def response(resp):
    """Convert a Jisho API response into a list of result dicts.

    ``resp.text`` is parsed as JSON and its ``data`` list is walked. Entries
    whose first sense is a Wikipedia definition are skipped. Every remaining
    entry yields a result with ``url``, ``title`` (all written forms) and
    ``content`` (all English definitions, truncated to 300 characters). The
    first remaining entry additionally yields an infobox dict (``infobox``,
    ``urls``, ``content``) placed right after its result.

    Entries with no senses, senses without parts of speech, or senses lacking
    the optional ``tags``/``info``/``restrictions`` fields are handled
    instead of raising ``IndexError``/``KeyError``.
    """
    results = []
    infoboxed = False

    search_results = json.loads(resp.text)
    for page in search_results.get('data') or []:
        if _is_wikipedia_only(page):
            continue

        # Fall back to the slug so the title and infobox are never empty.
        forms = _alt_forms(page) or [page['slug']]
        page_url = urljoin(BASE_URL, page['slug'])
        definitions = _definitions(page, with_extra=not infoboxed)

        # All definitions as description, truncated to 300 characters.
        content = ''.join(f"{engdef}. " for _, engdef, _ in definitions)
        results.append({
            'url': page_url,
            'title': ", ".join(forms),
            'content': content[:300] + (content[300:] and '...'),
        })

        # Like Wordnik, the first result is also returned as an infobox.
        if not infoboxed:
            infoboxed = True
            results.append({
                'infobox': forms[0],
                'urls': [{'title': 'Jisho.org', 'url': page_url}],
                'content': _infobox_content(forms[1:], definitions),
            })

    return results

View file

@ -798,6 +798,12 @@ engines:
timeout: 3.0
disabled: true
- name: jisho
engine: jisho
shortcut: js
timeout: 4.0
disabled: true
- name: kickass
engine: kickass
shortcut: kc