use polyglot for detecting the language

This commit is contained in:
mammo0 2021-03-11 10:01:12 +01:00
parent 9301ad0bda
commit 51980f6ff5
2 changed files with 40 additions and 23 deletions

View file

@ -6,6 +6,7 @@ from langdetect import detect_langs
from langdetect import DetectorFactory from langdetect import DetectorFactory
from pkg_resources import resource_filename from pkg_resources import resource_filename
from .api_keys import Database from .api_keys import Database
from app.language import detect_languages
DetectorFactory.seed = 0 # deterministic DetectorFactory.seed = 0 # deterministic
@ -57,11 +58,6 @@ def create_app(args):
from app.language import languages from app.language import languages
app = Flask(__name__) app = Flask(__name__)
# For faster access
language_map = {}
for l in languages:
language_map[l.code] = l.name
if args.debug: if args.debug:
app.config['TEMPLATES_AUTO_RELOAD'] = True app.config['TEMPLATES_AUTO_RELOAD'] = True
@ -271,19 +267,12 @@ def create_app(args):
abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit)) abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit))
if source_lang == 'auto': if source_lang == 'auto':
candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q))) candidate_langs = detect_languages(q)
if len(candidate_langs) > 0:
candidate_langs.sort(key=lambda l: l.prob, reverse=True)
if args.debug: if args.debug:
print(candidate_langs) print(candidate_langs)
source_lang = next(iter([l.code for l in languages if l.code == candidate_langs[0].lang]), None) source_lang = candidate_langs[0]["language"]
if not source_lang:
source_lang = 'en'
else:
source_lang = 'en'
if args.debug: if args.debug:
print("Auto detected: %s" % source_lang) print("Auto detected: %s" % source_lang)
@ -385,12 +374,7 @@ def create_app(args):
if not q: if not q:
abort(400, description="Invalid request: missing q parameter") abort(400, description="Invalid request: missing q parameter")
candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q))) return jsonify(detect_languages(q))
candidate_langs.sort(key=lambda l: l.prob, reverse=True)
return jsonify([{
'confidence': l.prob,
'language': l.lang
} for l in candidate_langs])
@app.route("/frontend/settings") @app.route("/frontend/settings")

View file

@ -1,3 +1,36 @@
from argostranslate import translate from argostranslate import translate
from polyglot.detect.base import Detector
languages = translate.load_installed_languages() languages = translate.load_installed_languages()
# Codes of every language in `languages`; detect_languages() uses this to
# discard detected languages that are not in that list.
__lang_codes = [l.code for l in languages]
def detect_languages(text):
    """Detect the language(s) of *text* among the installed languages.

    Args:
        text: The string whose language should be detected.

    Returns:
        A non-empty list of ``{'confidence': ..., 'language': ...}`` dicts,
        sorted by descending confidence. When no installed language can be
        detected, the list holds a single ``"en"`` entry with confidence
        ``0.0``.
    """
    # Local import: keeps this fix self-contained without touching the
    # module-level import block.
    from polyglot.detect.base import UnknownLanguage

    try:
        detected = Detector(text).languages
    except UnknownLanguage:
        # polyglot raises (instead of returning placeholders) when the text
        # is too short/ambiguous to detect; treat that as "nothing detected"
        # so callers get the documented fallback rather than an exception.
        detected = []

    # keep only real detections (non-zero bytes read) of known languages
    candidate_langs = [
        l for l in detected
        if l.read_bytes != 0 and l.code in __lang_codes
    ]

    # this happens if no language can be detected
    if not candidate_langs:
        # use language "en" by default but with zero confidence
        return [
            {
                'confidence': 0.0,
                'language': "en"
            }
        ]

    # order the candidates descending based on the detected confidence
    ranked = sorted(candidate_langs, key=lambda l: l.confidence, reverse=True)

    return [
        {
            'confidence': l.confidence,
            'language': l.code
        }
        for l in ranked
    ]