diff --git a/app/__init__.py b/app/__init__.py
index 1fb8330..53554d9 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -1,2 +1,10 @@
+import os
+from appdirs import user_data_dir
+
+# override polyglot path
+import polyglot
+polyglot.polyglot_path = os.path.join(user_data_dir(appname="LibreTranslate", appauthor="uav4geo"), "polyglot_data")
+
+
 from .main import main
 from .manage import manage
diff --git a/app/app.py b/app/app.py
index 72dbbc7..d673ff2 100644
--- a/app/app.py
+++ b/app/app.py
@@ -2,12 +2,9 @@ import os
 from flask import Flask, render_template, jsonify, request, abort, send_from_directory
 from flask_swagger import swagger
 from flask_swagger_ui import get_swaggerui_blueprint
-from langdetect import detect_langs
-from langdetect import DetectorFactory
 from pkg_resources import resource_filename
 from .api_keys import Database
-
-DetectorFactory.seed = 0 # deterministic
+from app.language import detect_languages, transliterate
 
 api_keys_db = None
 
@@ -57,11 +54,6 @@ def create_app(args):
     from app.language import languages
 
     app = Flask(__name__)
 
-    # For faster access
-    language_map = {}
-    for l in languages:
-        language_map[l.code] = l.name
-
     if args.debug:
         app.config['TEMPLATES_AUTO_RELOAD'] = True
@@ -271,19 +263,12 @@ def create_app(args):
             abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit))
 
         if source_lang == 'auto':
-            candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q)))
+            candidate_langs = detect_languages(q)
 
-            if len(candidate_langs) > 0:
-                candidate_langs.sort(key=lambda l: l.prob, reverse=True)
+            if args.debug:
+                print(candidate_langs)
 
-                if args.debug:
-                    print(candidate_langs)
-
-                source_lang = next(iter([l.code for l in languages if l.code == candidate_langs[0].lang]), None)
-                if not source_lang:
-                    source_lang = 'en'
-            else:
-                source_lang = 'en'
+            source_lang = candidate_langs[0]["language"]
 
         if args.debug:
             print("Auto detected: %s" % source_lang)
@@ -300,9 +285,9 @@ def create_app(args):
 
         try:
             if batch:
-                return jsonify({"translatedText": [translator.translate(text) for text in q] })
+                return jsonify({"translatedText": [translator.translate(transliterate(text, target_lang=source_lang)) for text in q] })
             else:
-                return jsonify({"translatedText": translator.translate(q) })
+                return jsonify({"translatedText": translator.translate(transliterate(q, target_lang=source_lang)) })
         except Exception as e:
             abort(500, description="Cannot translate text: %s" % str(e))
 
@@ -385,12 +370,7 @@ def create_app(args):
         if not q:
             abort(400, description="Invalid request: missing q parameter")
 
-        candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q)))
-        candidate_langs.sort(key=lambda l: l.prob, reverse=True)
-        return jsonify([{
-            'confidence': l.prob,
-            'language': l.lang
-        } for l in candidate_langs])
+        return jsonify(detect_languages(q))
 
 
     @app.route("/frontend/settings")
diff --git a/app/init.py b/app/init.py
index c540c1c..9cf83f7 100644
--- a/app/init.py
+++ b/app/init.py
@@ -2,9 +2,12 @@ import os
 from pathlib import Path
 from argostranslate import settings, package, translate
 import os, glob, shutil, zipfile
+from app.language import languages
+import polyglot
 
 def boot():
-    check_and_install_models()
+    check_and_install_models()
+    check_and_install_transliteration()
 
 def check_and_install_models(force=False):
     if len(package.get_installed_packages()) < 2 or force:
@@ -22,5 +25,32 @@ def check_and_install_models(force=False):
             download_path = available_package.download()
             package.install_from_path(download_path)
 
+        # reload installed languages
+        global languages
+        languages = translate.load_installed_languages()
         print("Loaded support for %s languages (%s models total)!"
                 % (len(translate.load_installed_languages()), len(available_packages)))
-
\ No newline at end of file
+
+
+def check_and_install_transliteration(force=False):
+    # 'en' is not a supported transliteration language
+    transliteration_languages = [l.code for l in languages if l.code != "en"]
+
+    # check installed
+    install_needed = []
+    if not force:
+        t_packages_path = Path(polyglot.polyglot_path) / "transliteration2"
+        for lang in transliteration_languages:
+            if not (t_packages_path / lang / f"transliteration.{lang}.tar.bz2").exists():
+                install_needed.append(lang)
+    else:
+        install_needed = transliteration_languages
+
+    # install the needed transliteration packages
+    if install_needed:
+        print(f"Installing transliteration models for the following languages: {', '.join(install_needed)}")
+
+        from polyglot.downloader import Downloader
+        downloader = Downloader()
+
+        for lang in install_needed:
+            downloader.download(f"transliteration2.{lang}")
diff --git a/app/language.py b/app/language.py
index 279f2c8..e7a2d49 100644
--- a/app/language.py
+++ b/app/language.py
@@ -1,3 +1,124 @@
-from argostranslate import translate
+import string
 
-languages = translate.load_installed_languages()
\ No newline at end of file
+from argostranslate import translate
+from polyglot.detect.base import Detector, UnknownLanguage
+from polyglot.transliteration.base import Transliterator
+
+
+languages = translate.load_installed_languages()
+
+
+__lang_codes = [l.code for l in languages]
+
+
+def detect_languages(text):
+    # detect batch processing
+    if isinstance(text, list):
+        is_batch = True
+    else:
+        is_batch = False
+        text = [text]
+
+    # get the candidates
+    candidates = []
+    for t in text:
+        try:
+            candidates.extend(Detector(t).languages)
+        except UnknownLanguage:
+            pass
+
+    # total read bytes of the provided text
+    read_bytes_total = sum(c.read_bytes for c in candidates)
+
+    # only use candidates that are supported by argostranslate
+    candidate_langs = list(filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, candidates))
+
+    # this happens if no language could be detected
+    if not candidate_langs:
+        # use language "en" by default but with zero confidence
+        return [
+            {
+                'confidence': 0.0,
+                'language': "en"
+            }
+        ]
+
+    # for multiple occurrences of the same language (can happen on batch detection)
+    # calculate the average confidence for each language
+    if is_batch:
+        temp_average_list = []
+        for lang_code in __lang_codes:
+            # get all candidates for a specific language
+            lc = list(filter(lambda l: l.code == lang_code, candidate_langs))
+            if len(lc) > 1:
+                # if more than one is present, calculate the average confidence
+                lang = lc[0]
+                lang.confidence = sum(l.confidence for l in lc) / len(lc)
+                lang.read_bytes = sum(l.read_bytes for l in lc)
+                temp_average_list.append(lang)
+            elif lc:
+                # otherwise just add it to the temporary list
+                temp_average_list.append(lc[0])
+
+        if temp_average_list:
+            # replace the list
+            candidate_langs = temp_average_list
+
+    # sort the candidates descending based on the detected confidence
+    candidate_langs.sort(key=lambda l: (l.confidence * l.read_bytes) / read_bytes_total, reverse=True)
+
+    return [
+        {
+            'confidence': l.confidence,
+            'language': l.code
+        }
+        for l in candidate_langs
+    ]
+
+
+def __transliterate_line(transliterator, line_text):
+    new_text = []
+
+    # transliteration is done word by word
+    for orig_word in line_text.split(" "):
+        # capture the exact punctuation stripped from the right side
+        r_word = orig_word.rstrip(string.punctuation)
+        r_diff = orig_word[len(r_word):]
+        # and from the left side (slicing keeps order and duplicates)
+        l_word = orig_word.lstrip(string.punctuation)
+        l_diff = orig_word[:len(orig_word) - len(l_word)]
+
+        # the actual transliteration of the word
+        t_word = transliterator.transliterate(orig_word.strip(string.punctuation))
+
+        # if transliteration fails, default back to the original word
+        if not t_word:
+            t_word = orig_word
+        else:
+            # add back any stripped punctuation
+            if r_diff:
+                t_word = t_word + r_diff
+            if l_diff:
+                t_word = l_diff + t_word
+
+        new_text.append(t_word)
+
+    # rebuild the text
+    return " ".join(new_text)
+
+
+def transliterate(text, target_lang="en"):
+    # initialize the transliterator from polyglot
+    transliterator = Transliterator(target_lang=target_lang)
+
+    # check for multiline string
+    if "\n" in text:
+        lines = []
+        # process each line separate
+        for line in text.split("\n"):
+            lines.append(__transliterate_line(transliterator, line))
+
+        # rejoin multiline string
+        return "\n".join(lines)
+    else:
+        return __transliterate_line(transliterator, text)
diff --git a/install_models.py b/install_models.py
index 5492f1a..4768c3c 100755
--- a/install_models.py
+++ b/install_models.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
-from app.init import check_and_install_models
+from app.init import check_and_install_models, check_and_install_transliteration
 
 if __name__ == "__main__":
     check_and_install_models(force=True)
+    check_and_install_transliteration(force=True)
diff --git a/requirements.txt b/requirements.txt
index 2370721..e13d5d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,9 @@ flask-swagger==0.2.14
 flask-swagger-ui==3.36.0
 Flask-Limiter==1.4
 waitress==1.4.4
-langdetect==1.0.8
 expiringdict==1.2.1
+pyicu==2.6
+pycld2==0.41
+morfessor==2.0.6
+polyglot==16.7.4
+appdirs==1.4.4