Merge pull request #60 from mammo0/polyglot

use polyglot for language detection and transliteration
This commit is contained in:
Piero Toffanin 2021-03-12 10:54:41 -05:00 committed by GitHub
commit fb031b826a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 178 additions and 34 deletions

View file

@ -1,2 +1,10 @@
import os
from appdirs import user_data_dir
# Override polyglot's data path: point polyglot.polyglot_path at a
# "polyglot_data" folder inside the per-user data directory that appdirs
# resolves for appname "LibreTranslate" / appauthor "uav4geo".
# NOTE(review): this assignment is placed before the .main/.manage imports,
# presumably so that modules loaded afterwards see the overridden path — confirm.
import polyglot
polyglot.polyglot_path = os.path.join(user_data_dir(appname="LibreTranslate", appauthor="uav4geo"), "polyglot_data")
from .main import main from .main import main
from .manage import manage from .manage import manage

View file

@ -2,12 +2,9 @@ import os
from flask import Flask, render_template, jsonify, request, abort, send_from_directory from flask import Flask, render_template, jsonify, request, abort, send_from_directory
from flask_swagger import swagger from flask_swagger import swagger
from flask_swagger_ui import get_swaggerui_blueprint from flask_swagger_ui import get_swaggerui_blueprint
from langdetect import detect_langs
from langdetect import DetectorFactory
from pkg_resources import resource_filename from pkg_resources import resource_filename
from .api_keys import Database from .api_keys import Database
from app.language import detect_languages, transliterate
DetectorFactory.seed = 0 # deterministic
api_keys_db = None api_keys_db = None
@ -57,11 +54,6 @@ def create_app(args):
from app.language import languages from app.language import languages
app = Flask(__name__) app = Flask(__name__)
# For faster access
language_map = {}
for l in languages:
language_map[l.code] = l.name
if args.debug: if args.debug:
app.config['TEMPLATES_AUTO_RELOAD'] = True app.config['TEMPLATES_AUTO_RELOAD'] = True
@ -271,19 +263,12 @@ def create_app(args):
abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit)) abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit))
if source_lang == 'auto': if source_lang == 'auto':
candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q))) candidate_langs = detect_languages(q)
if len(candidate_langs) > 0:
candidate_langs.sort(key=lambda l: l.prob, reverse=True)
if args.debug: if args.debug:
print(candidate_langs) print(candidate_langs)
source_lang = next(iter([l.code for l in languages if l.code == candidate_langs[0].lang]), None) source_lang = candidate_langs[0]["language"]
if not source_lang:
source_lang = 'en'
else:
source_lang = 'en'
if args.debug: if args.debug:
print("Auto detected: %s" % source_lang) print("Auto detected: %s" % source_lang)
@ -300,9 +285,9 @@ def create_app(args):
try: try:
if batch: if batch:
return jsonify({"translatedText": [translator.translate(text) for text in q] }) return jsonify({"translatedText": [translator.translate(transliterate(text, target_lang=source_lang)) for text in q] })
else: else:
return jsonify({"translatedText": translator.translate(q) }) return jsonify({"translatedText": translator.translate(transliterate(q, target_lang=source_lang)) })
except Exception as e: except Exception as e:
abort(500, description="Cannot translate text: %s" % str(e)) abort(500, description="Cannot translate text: %s" % str(e))
@ -385,12 +370,7 @@ def create_app(args):
if not q: if not q:
abort(400, description="Invalid request: missing q parameter") abort(400, description="Invalid request: missing q parameter")
candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q))) return jsonify(detect_languages(q))
candidate_langs.sort(key=lambda l: l.prob, reverse=True)
return jsonify([{
'confidence': l.prob,
'language': l.lang
} for l in candidate_langs])
@app.route("/frontend/settings") @app.route("/frontend/settings")

View file

@ -2,9 +2,12 @@ import os
from pathlib import Path from pathlib import Path
from argostranslate import settings, package, translate from argostranslate import settings, package, translate
import os, glob, shutil, zipfile import os, glob, shutil, zipfile
from app.language import languages
import polyglot
def boot(): def boot():
check_and_install_models() check_and_install_models()
check_and_install_transliteration()
def check_and_install_models(force=False): def check_and_install_models(force=False):
if len(package.get_installed_packages()) < 2 or force: if len(package.get_installed_packages()) < 2 or force:
@ -22,5 +25,32 @@ def check_and_install_models(force=False):
download_path = available_package.download() download_path = available_package.download()
package.install_from_path(download_path) package.install_from_path(download_path)
# reload installed languages
global languages
languages = translate.load_installed_languages()
print("Loaded support for %s languages (%s models total)!" % (len(translate.load_installed_languages()), len(available_packages))) print("Loaded support for %s languages (%s models total)!" % (len(translate.load_installed_languages()), len(available_packages)))
def check_and_install_transliteration(force=False):
    """Download polyglot transliteration packages for the installed languages.

    Every language code in the module-level ``languages`` list except "en"
    is considered ('en' is not a supported transliteration language).

    Args:
        force: When true, download the package for every considered language.
            When false, download only for languages whose archive
            ``transliteration2/<code>/transliteration.<code>.tar.bz2`` does
            not exist below ``polyglot.polyglot_path``.

    Returns:
        None. Prints a message and triggers downloads as a side effect.
    """
    wanted_codes = [entry.code for entry in languages if entry.code != "en"]

    if force:
        pending = wanted_codes
    else:
        # keep only the languages whose package archive is not on disk yet
        package_root = Path(polyglot.polyglot_path) / "transliteration2"
        pending = [
            code
            for code in wanted_codes
            if not (package_root / code / f"transliteration.{code}.tar.bz2").exists()
        ]

    # nothing to download
    if not pending:
        return

    print(f"Installing transliteration models for the following languages: {', '.join(pending)}")

    # imported only when a download is actually required
    from polyglot.downloader import Downloader
    downloader = Downloader()

    for code in pending:
        downloader.download(f"transliteration2.{code}")

View file

@ -1,3 +1,124 @@
import string
from argostranslate import translate from argostranslate import translate
from polyglot.detect.base import Detector, UnknownLanguage
from polyglot.transliteration.base import Transliterator
languages = translate.load_installed_languages() languages = translate.load_installed_languages()
__lang_codes = [l.code for l in languages]
def detect_languages(text):
    """Detect the language(s) of a text or of a batch of texts.

    Args:
        text: A single string, or a list of strings (batch mode).

    Returns:
        A list of ``{'confidence': ..., 'language': ...}`` dicts, restricted
        to language codes present in ``__lang_codes`` and sorted descending
        by confidence weighted with each candidate's share of the bytes read.
        When nothing usable is detected, a single entry for "en" with a
        confidence of 0.0 is returned.
    """
    batch_mode = isinstance(text, list)
    texts = text if batch_mode else [text]

    # collect the raw candidates of every text; undetectable texts add none
    detected = []
    for item in texts:
        try:
            detected.extend(Detector(item).languages)
        except UnknownLanguage:
            continue

    # total read bytes over all raw candidates (taken before any filtering)
    total_bytes = sum(entry.read_bytes for entry in detected)

    # keep only candidates that read something and are in the supported codes
    supported = [
        entry
        for entry in detected
        if entry.read_bytes != 0 and entry.code in __lang_codes
    ]

    # no language could be detected: default to "en" with zero confidence
    if not supported:
        return [
            {
                'confidence': 0.0,
                'language': "en"
            }
        ]

    if batch_mode:
        # a language may show up once per text; merge those occurrences into
        # the first one, averaging the confidence and summing the read bytes
        merged = []
        for code in __lang_codes:
            same_language = [entry for entry in supported if entry.code == code]
            if not same_language:
                continue
            first = same_language[0]
            if len(same_language) > 1:
                first.confidence = sum(e.confidence for e in same_language) / len(same_language)
                first.read_bytes = sum(e.read_bytes for e in same_language)
            merged.append(first)

        if merged:
            supported = merged

    def weighted_confidence(entry):
        # confidence scaled by the candidate's share of all read bytes
        return (entry.confidence * entry.read_bytes) / total_bytes

    supported.sort(key=weighted_confidence, reverse=True)

    return [
        {
            'confidence': entry.confidence,
            'language': entry.code
        }
        for entry in supported
    ]
def __transliterate_line(transliterator, line_text):
    """Transliterate a single line word by word, keeping edge punctuation.

    The line is split on single spaces. For each word the leading and
    trailing ``string.punctuation`` characters are cut off, the remaining
    core is passed to ``transliterator.transliterate`` and the punctuation
    is put back exactly as it was (same characters, order and repetition).

    Args:
        transliterator: Object exposing ``transliterate(word)`` that returns
            the transliterated word, or a falsy value on failure.
        line_text: The line to process; expected to contain no newlines
            (the caller splits multi-line input).

    Returns:
        The rebuilt line, words joined by single spaces. Words that are
        empty, consist only of punctuation, or fail to transliterate are
        kept unchanged.
    """
    punctuation = string.punctuation
    new_text = []

    # transliteration is done word by word
    for orig_word in line_text.split(" "):
        core = orig_word.strip(punctuation)

        # empty token (consecutive spaces) or pure punctuation: there is
        # nothing to transliterate, keep the token as it is
        if not core:
            new_text.append(orig_word)
            continue

        # Slice the exact punctuation runs off the original word. A set
        # difference of characters would collapse repeats ("!!!" -> "!"),
        # lose the order ("?!") and miss punctuation that also occurs inside
        # the word ("'tis'").
        prefix = orig_word[:len(orig_word) - len(orig_word.lstrip(punctuation))]
        suffix = orig_word[len(orig_word.rstrip(punctuation)):]

        # the actual transliteration of the word
        t_word = transliterator.transliterate(core)

        if t_word:
            # add back the stripped punctuation
            new_text.append(prefix + t_word + suffix)
        else:
            # if transliteration fails, default back to the original word
            new_text.append(orig_word)

    # rebuild the text
    return " ".join(new_text)
def transliterate(text, target_lang="en"):
    """Transliterate ``text`` line by line with polyglot's Transliterator.

    Args:
        text: The string to transliterate; it may span several lines.
        target_lang: Language code handed to ``Transliterator`` as its
            ``target_lang``.

    Returns:
        The transliterated text with its "\\n" line breaks preserved.
    """
    # one transliterator instance is shared by all lines of this call
    transliterator = Transliterator(target_lang=target_lang)

    # a string without "\n" splits into a single line, so single-line and
    # multi-line input take the same path
    return "\n".join(
        __transliterate_line(transliterator, line)
        for line in text.split("\n")
    )

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
from app.init import check_and_install_models from app.init import check_and_install_models, check_and_install_transliteration
if __name__ == "__main__": if __name__ == "__main__":
check_and_install_models(force=True) check_and_install_models(force=True)
check_and_install_transliteration(force=True)

View file

@ -4,5 +4,9 @@ flask-swagger==0.2.14
flask-swagger-ui==3.36.0 flask-swagger-ui==3.36.0
Flask-Limiter==1.4 Flask-Limiter==1.4
waitress==1.4.4 waitress==1.4.4
langdetect==1.0.8
expiringdict==1.2.1 expiringdict==1.2.1
pyicu==2.6
pycld2==0.41
morfessor==2.0.6
polyglot==16.7.4
appdirs==1.4.4