From 51980f6ff54d702bf4edbe1d334316d05ae441c8 Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 10:01:12 +0100 Subject: [PATCH] use polyglot for detecting the language --- app/app.py | 28 ++++++---------------------- app/language.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/app/app.py b/app/app.py index 72dbbc7..799f4eb 100644 --- a/app/app.py +++ b/app/app.py @@ -6,6 +6,7 @@ from langdetect import detect_langs from langdetect import DetectorFactory from pkg_resources import resource_filename from .api_keys import Database +from app.language import detect_languages DetectorFactory.seed = 0 # deterministic @@ -57,11 +58,6 @@ def create_app(args): from app.language import languages app = Flask(__name__) - # For faster access - language_map = {} - for l in languages: - language_map[l.code] = l.name - if args.debug: app.config['TEMPLATES_AUTO_RELOAD'] = True @@ -271,19 +267,12 @@ def create_app(args): abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit)) if source_lang == 'auto': - candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q))) + candidate_langs = detect_languages(q) - if len(candidate_langs) > 0: - candidate_langs.sort(key=lambda l: l.prob, reverse=True) + if args.debug: + print(candidate_langs) - if args.debug: - print(candidate_langs) - - source_lang = next(iter([l.code for l in languages if l.code == candidate_langs[0].lang]), None) - if not source_lang: - source_lang = 'en' - else: - source_lang = 'en' + source_lang = candidate_langs[0]["language"] if args.debug: print("Auto detected: %s" % source_lang) @@ -385,12 +374,7 @@ def create_app(args): if not q: abort(400, description="Invalid request: missing q parameter") - candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q))) - candidate_langs.sort(key=lambda l: l.prob, reverse=True) - return jsonify([{ - 'confidence': l.prob, - 'language': l.lang - } for l in candidate_langs]) + return jsonify(detect_languages(q)) @app.route("/frontend/settings") diff --git a/app/language.py b/app/language.py index 279f2c8..2a1b71c 100644 --- a/app/language.py +++ b/app/language.py @@ -1,3 +1,36 @@ from argostranslate import translate +from polyglot.detect.base import Detector -languages = translate.load_installed_languages() \ No newline at end of file + +languages = translate.load_installed_languages() + + +__lang_codes = [l.code for l in languages] + + +def detect_languages(text): + f = Detector(text).languages + + # get the candidates + candidate_langs = list(filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, f)) + + # this happens if no language can be detected + if not candidate_langs: + # use language "en" by default but with zero confidence + return [ + { + 'confidence': 0.0, + 'language': "en" + } + ] + + # sort the candidates descending based on the detected confidence + candidate_langs.sort(key=lambda l: l.confidence, reverse=True) + + return [ + { + 'confidence': l.confidence, + 'language': l.code + } + for l in candidate_langs + ]