From 9301ad0bdab08ddaf9ab572aea2ed03667793d1a Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 10:00:52 +0100 Subject: [PATCH 01/10] added polyglot to requirements.txt --- requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/requirements.txt b/requirements.txt index cbddbb9..29da098 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,7 @@ Flask-Limiter==1.4 waitress==1.4.4 langdetect==1.0.8 expiringdict==1.2.1 +pyicu==2.6 +pycld2==0.41 +morfessor==2.0.6 +polyglot==16.7.4 From 51980f6ff54d702bf4edbe1d334316d05ae441c8 Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 10:01:12 +0100 Subject: [PATCH 02/10] use polyglot for detecting the language --- app/app.py | 28 ++++++---------------------- app/language.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/app/app.py b/app/app.py index 72dbbc7..799f4eb 100644 --- a/app/app.py +++ b/app/app.py @@ -6,6 +6,7 @@ from langdetect import detect_langs from langdetect import DetectorFactory from pkg_resources import resource_filename from .api_keys import Database +from app.language import detect_languages DetectorFactory.seed = 0 # deterministic @@ -57,11 +58,6 @@ def create_app(args): from app.language import languages app = Flask(__name__) - # For faster access - language_map = {} - for l in languages: - language_map[l.code] = l.name - if args.debug: app.config['TEMPLATES_AUTO_RELOAD'] = True @@ -271,19 +267,12 @@ def create_app(args): abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit)) if source_lang == 'auto': - candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q))) + candidate_langs = detect_languages(q) - if len(candidate_langs) > 0: - candidate_langs.sort(key=lambda l: l.prob, reverse=True) + if args.debug: + print(candidate_langs) - if args.debug: - print(candidate_langs) - - source_lang = next(iter([l.code for l in languages if l.code == candidate_langs[0].lang]), None) - if not source_lang: - source_lang = 'en' - else: - source_lang = 'en' + source_lang = candidate_langs[0]["language"] if args.debug: print("Auto detected: %s" % source_lang) @@ -385,12 +374,7 @@ def create_app(args): if not q: abort(400, description="Invalid request: missing q parameter") - candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q))) - candidate_langs.sort(key=lambda l: l.prob, reverse=True) - return jsonify([{ - 'confidence': l.prob, - 'language': l.lang - } for l in candidate_langs]) + return jsonify(detect_languages(q)) @app.route("/frontend/settings") diff --git a/app/language.py b/app/language.py index 279f2c8..2a1b71c 100644 --- a/app/language.py +++ b/app/language.py @@ -1,3 +1,36 @@ from argostranslate import translate +from polyglot.detect.base import Detector -languages = translate.load_installed_languages() \ No newline at end of file + +languages = translate.load_installed_languages() + + +__lang_codes = [l.code for l in languages] + + +def detect_languages(text): + f = Detector(text).languages + + # get the candidates + candidate_langs = list(filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, f)) + + # this happens if no language can be detected + if not candidate_langs: + # use language "en" by default but with zero confidence + return [ + { + 'confidence': 0.0, + 'language': "en" + } + ] + + # sort the candidates descending based on the detected confidence + candidate_langs.sort(key=lambda l: l.confidence, reverse=True) + + return [ + { + 'confidence': l.confidence, + 'language': l.code + } + for l in candidate_langs + ] From d4cb859c8dea6cb9acfa0d976b10f4bf1572b3e2 Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 10:03:53 +0100 Subject: [PATCH 03/10] removed langdetect dependency --- app/app.py | 4 ---- requirements.txt | 1 - 2 files changed, 5 deletions(-) diff --git a/app/app.py b/app/app.py index 799f4eb..f1b7b70 100644 --- a/app/app.py +++ b/app/app.py @@ -2,14 +2,10 @@ import os from flask import Flask, render_template, jsonify, request, abort, send_from_directory from flask_swagger import swagger from flask_swagger_ui import get_swaggerui_blueprint -from langdetect import detect_langs -from langdetect import DetectorFactory from pkg_resources import resource_filename from .api_keys import Database from app.language import detect_languages -DetectorFactory.seed = 0 # deterministic - api_keys_db = None def get_json_dict(request): diff --git a/requirements.txt b/requirements.txt index 29da098..30763a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ flask-swagger==0.2.14 flask-swagger-ui==3.36.0 Flask-Limiter==1.4 waitress==1.4.4 -langdetect==1.0.8 expiringdict==1.2.1 pyicu==2.6 pycld2==0.41 From 36fee9bf1b45d6c111e4faac28ace68876aab4f7 Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 10:52:38 +0100 Subject: [PATCH 04/10] allow batch processing for language detection --- app/language.py | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/app/language.py b/app/language.py index 2a1b71c..17d1822 100644 --- a/app/language.py +++ b/app/language.py @@ -9,12 +9,25 @@ __lang_codes = [l.code for l in languages] def detect_languages(text): - f = Detector(text).languages + # detect batch processing + if isinstance(text, list): + is_batch = True + else: + is_batch = False + text = [text] # get the candidates - candidate_langs = list(filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, f)) + candidates = [] + for t in text: + candidates.extend(Detector(t).languages) - # this happens if no language can be detected + # total read bytes of the provided text + read_bytes_total = sum(c.read_bytes for c in candidates) + + # only use candidates that are supported by argostranslate + candidate_langs = list(filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, candidates)) + + # this happens if no language could be detected if not candidate_langs: # use language "en" by default but with zero confidence return [ @@ -24,8 +37,29 @@ def detect_languages(text): } ] + # for multiple occurrences of the same language (can happen on batch detection) + # calculate the average confidence for each language + if is_batch: + temp_average_list = [] + for lang_code in __lang_codes: + # get all candidates for a specific language + lc = list(filter(lambda l: l.code == lang_code, candidate_langs)) + if len(lc) > 1: + # if more than one is present, calculate the average confidence + lang = lc[0] + lang.confidence = sum(l.confidence for l in lc) / len(lc) + lang.read_bytes = sum(l.read_bytes for l in lc) + temp_average_list.append(lang) + elif lc: + # otherwise just add it to the temporary list + temp_average_list.append(lc[0]) + + if temp_average_list: + # replace the list + candidate_langs = temp_average_list + # sort the candidates descending based on the detected confidence - candidate_langs.sort(key=lambda l: l.confidence, reverse=True) + candidate_langs.sort(key=lambda l: (l.confidence * l.read_bytes) / read_bytes_total, reverse=True) return [ { From 25fb3b3c21129564ebb539000bf886cce088d4fb Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 12:32:26 +0100 Subject: [PATCH 05/10] added transliteration before actual translation -> e.g. if the source language is Russian, argostranslate expects a cyrillic text --- app/app.py | 6 +++--- app/language.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/app/app.py b/app/app.py index f1b7b70..d673ff2 100644 --- a/app/app.py +++ b/app/app.py @@ -4,7 +4,7 @@ from flask_swagger import swagger from flask_swagger_ui import get_swaggerui_blueprint from pkg_resources import resource_filename from .api_keys import Database -from app.language import detect_languages +from app.language import detect_languages, transliterate api_keys_db = None @@ -285,9 +285,9 @@ def create_app(args): try: if batch: - return jsonify({"translatedText": [translator.translate(text) for text in q] }) + return jsonify({"translatedText": [translator.translate(transliterate(text, target_lang=source_lang)) for text in q] }) else: - return jsonify({"translatedText": translator.translate(q) }) + return jsonify({"translatedText": translator.translate(transliterate(q, target_lang=source_lang)) }) except Exception as e: abort(500, description="Cannot translate text: %s" % str(e)) diff --git a/app/language.py b/app/language.py index 17d1822..e6f3613 100644 --- a/app/language.py +++ b/app/language.py @@ -1,5 +1,8 @@ +import string + from argostranslate import translate from polyglot.detect.base import Detector +from polyglot.transliteration.base import Transliterator languages = translate.load_installed_languages() @@ -68,3 +71,51 @@ def detect_languages(text): } for l in candidate_langs ] + + +def __transliterate_line(transliterator, line_text): + new_text = [] + + # transliteration is done word by word + for orig_word in line_text.split(" "): + # remove any punctuation on the right side + r_word = orig_word.rstrip(string.punctuation) + r_diff = set(char for char in orig_word) - set(char for char in r_word) + # and on the left side + l_word = orig_word.lstrip(string.punctuation) + l_diff = set(char for char in orig_word) - set(char for char in l_word) + + # the actual transliteration of the word + t_word = transliterator.transliterate(orig_word.strip(string.punctuation)) + + # if transliteration fails, default back to the original word + if not t_word: + t_word = orig_word + else: + # add back any stripped punctuation + if r_diff: + t_word = t_word + ''.join(r_diff) + if l_diff: + t_word = ''.join(l_diff) + t_word + + new_text.append(t_word) + + # rebuild the text + return " ".join(new_text) + + +def transliterate(text, target_lang="en"): + # initialize the transliterator from polyglot + transliterator = Transliterator(target_lang=target_lang) + + # check for multiline string + if "\n" in text: + lines = [] + # process each line separate + for line in text.split("\n"): + lines.append(__transliterate_line(transliterator, line)) + + # rejoin multiline string + return "\n".join(lines) + else: + return __transliterate_line(transliterator, text) From e0693d697e65e6d620433c7476f8bca5f15cc7b7 Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 12:33:54 +0100 Subject: [PATCH 06/10] download the transliteration packages of polyglot during boot --- app/init.py | 29 +++++++++++++++++++++++++++-- install_models.py | 3 ++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/app/init.py b/app/init.py index c540c1c..8b4ef65 100644 --- a/app/init.py +++ b/app/init.py @@ -2,9 +2,12 @@ import os from pathlib import Path from argostranslate import settings, package, translate import os, glob, shutil, zipfile +from app.language import languages +import polyglot def boot(): - check_and_install_models() + check_and_install_models() + check_and_install_transliteration() def check_and_install_models(force=False): if len(package.get_installed_packages()) < 2 or force: @@ -23,4 +26,26 @@ def check_and_install_models(force=False): package.install_from_path(download_path) print("Loaded support for %s languages (%s models total)!" % (len(translate.load_installed_languages()), len(available_packages))) - \ No newline at end of file + + +def check_and_install_transliteration(force=False): + # 'en' is not a supported transliteration language + transliteration_languages = [l.code for l in languages if l.code != "en"] + + # check installed + install_needed = [] + if not force: + t_packages_path = Path(polyglot.polyglot_path) / "transliteration2" + for lang in transliteration_languages: + if not (t_packages_path / lang / f"transliteration.{lang}.tar.bz2").exists(): + install_needed.append(lang) + else: + install_needed = transliteration_languages + + # install the needed transliteration packages + if install_needed: + from polyglot.downloader import Downloader + downloader = Downloader() + + for lang in install_needed: + downloader.download(f"transliteration2.{lang}") diff --git a/install_models.py b/install_models.py index 5492f1a..4768c3c 100755 --- a/install_models.py +++ b/install_models.py @@ -1,6 +1,7 @@ #!/usr/bin/env python -from app.init import check_and_install_models +from app.init import check_and_install_models, check_and_install_transliteration if __name__ == "__main__": check_and_install_models(force=True) + check_and_install_transliteration(force=True) From dd128162efa427c9832e48053e43081d857bc5d5 Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 13:34:48 +0100 Subject: [PATCH 07/10] use appdirs to define path to polyglot data --- app/__init__.py | 8 ++++++++ requirements.txt | 1 + 2 files changed, 9 insertions(+) diff --git a/app/__init__.py b/app/__init__.py index 1fb8330..53554d9 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,2 +1,10 @@ +import os +from appdirs import user_data_dir + +# override polyglot path +import polyglot +polyglot.polyglot_path = os.path.join(user_data_dir(appname="LibreTranslate", appauthor="uav4geo"), "polyglot_data") + + from .main import main from .manage import manage diff --git a/requirements.txt b/requirements.txt index 30763a7..627a875 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ pyicu==2.6 pycld2==0.41 morfessor==2.0.6 polyglot==16.7.4 +appdirs==1.4.4 From 7967c1b2c48fd2ddb31cf8aef6366cd66e43310e Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 14:38:55 +0100 Subject: [PATCH 08/10] reload installed language models after updating them --- app/init.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/init.py b/app/init.py index 8b4ef65..21507c6 100644 --- a/app/init.py +++ b/app/init.py @@ -25,6 +25,9 @@ def check_and_install_models(force=False): download_path = available_package.download() package.install_from_path(download_path) + # reload installed languages + global languages + languages = translate.load_installed_languages() print("Loaded support for %s languages (%s models total)!" % (len(translate.load_installed_languages()), len(available_packages))) From 44da802a7fbd01a1685e8c95070b706c9a6066bc Mon Sep 17 00:00:00 2001 From: mammo0 Date: Thu, 11 Mar 2021 14:39:28 +0100 Subject: [PATCH 09/10] some more output when installing transliteration models --- app/init.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/init.py b/app/init.py index 21507c6..9cf83f7 100644 --- a/app/init.py +++ b/app/init.py @@ -47,6 +47,8 @@ def check_and_install_transliteration(force=False): # install the needed transliteration packages if install_needed: + print(f"Installing transliteration models for the following languages: {', '.join(install_needed)}") + from polyglot.downloader import Downloader downloader = Downloader() From 79224edfe4dd2e30b16fda339419ab61d088eb4c Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Fri, 12 Mar 2021 10:53:09 -0500 Subject: [PATCH 10/10] Catch unknown language --- app/language.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/app/language.py b/app/language.py index e6f3613..e7a2d49 100644 --- a/app/language.py +++ b/app/language.py @@ -1,7 +1,7 @@ import string from argostranslate import translate -from polyglot.detect.base import Detector +from polyglot.detect.base import Detector, UnknownLanguage from polyglot.transliteration.base import Transliterator @@ -22,7 +22,10 @@ def detect_languages(text): # get the candidates candidates = [] for t in text: - candidates.extend(Detector(t).languages) + try: + candidates.extend(Detector(t).languages) + except UnknownLanguage as e: + pass # total read bytes of the provided text read_bytes_total = sum(c.read_bytes for c in candidates)