Merge pull request #60 from mammo0/polyglot

use polyglot for language detection and transliteration
This commit is contained in:
Piero Toffanin 2021-03-12 10:54:41 -05:00 committed by GitHub
commit fb031b826a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 178 additions and 34 deletions

View file

@ -1,2 +1,10 @@
import os
from appdirs import user_data_dir
# Point polyglot at a "polyglot_data" folder inside the per-user data
# directory that appdirs reports for LibreTranslate.
# NOTE(review): the override is placed before the package imports below,
# presumably so that they already see the new path - confirm.
import polyglot
polyglot.polyglot_path = os.path.join(user_data_dir(appname="LibreTranslate", appauthor="uav4geo"), "polyglot_data")
from .main import main
from .manage import manage

View file

@ -2,12 +2,9 @@ import os
from flask import Flask, render_template, jsonify, request, abort, send_from_directory
from flask_swagger import swagger
from flask_swagger_ui import get_swaggerui_blueprint
from langdetect import detect_langs
from langdetect import DetectorFactory
from pkg_resources import resource_filename
from .api_keys import Database
DetectorFactory.seed = 0 # deterministic
from app.language import detect_languages, transliterate
api_keys_db = None
@ -57,11 +54,6 @@ def create_app(args):
from app.language import languages
app = Flask(__name__)
# For faster access
language_map = {}
for l in languages:
language_map[l.code] = l.name
if args.debug:
app.config['TEMPLATES_AUTO_RELOAD'] = True
@ -271,19 +263,12 @@ def create_app(args):
abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit))
if source_lang == 'auto':
candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q)))
if len(candidate_langs) > 0:
candidate_langs.sort(key=lambda l: l.prob, reverse=True)
candidate_langs = detect_languages(q)
if args.debug:
print(candidate_langs)
source_lang = next(iter([l.code for l in languages if l.code == candidate_langs[0].lang]), None)
if not source_lang:
source_lang = 'en'
else:
source_lang = 'en'
source_lang = candidate_langs[0]["language"]
if args.debug:
print("Auto detected: %s" % source_lang)
@ -300,9 +285,9 @@ def create_app(args):
try:
if batch:
return jsonify({"translatedText": [translator.translate(text) for text in q] })
return jsonify({"translatedText": [translator.translate(transliterate(text, target_lang=source_lang)) for text in q] })
else:
return jsonify({"translatedText": translator.translate(q) })
return jsonify({"translatedText": translator.translate(transliterate(q, target_lang=source_lang)) })
except Exception as e:
abort(500, description="Cannot translate text: %s" % str(e))
@ -385,12 +370,7 @@ def create_app(args):
if not q:
abort(400, description="Invalid request: missing q parameter")
candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q)))
candidate_langs.sort(key=lambda l: l.prob, reverse=True)
return jsonify([{
'confidence': l.prob,
'language': l.lang
} for l in candidate_langs])
return jsonify(detect_languages(q))
@app.route("/frontend/settings")

View file

@ -2,9 +2,12 @@ import os
from pathlib import Path
from argostranslate import settings, package, translate
import os, glob, shutil, zipfile
from app.language import languages
import polyglot
def boot():
    """Run the start-up installation checks.

    Calls the translation model check first and the transliteration
    package check second, both with their default arguments.
    """
    startup_checks = (
        check_and_install_models,
        check_and_install_transliteration,
    )
    for run_check in startup_checks:
        run_check()
def check_and_install_models(force=False):
if len(package.get_installed_packages()) < 2 or force:
@ -22,5 +25,32 @@ def check_and_install_models(force=False):
download_path = available_package.download()
package.install_from_path(download_path)
# reload installed languages
global languages
languages = translate.load_installed_languages()
print("Loaded support for %s languages (%s models total)!" % (len(translate.load_installed_languages()), len(available_packages)))
def check_and_install_transliteration(force=False):
    """Download polyglot transliteration packages for the loaded languages.

    Every language code in ``languages`` except ``"en"`` is considered.
    Without ``force`` only the languages whose package archive is not found
    below ``polyglot.polyglot_path`` are downloaded; with ``force`` all of
    them are downloaded again.

    :param force: download every package even if it is already on disk
    """
    # 'en' is not a supported transliteration language
    wanted = [l.code for l in languages if l.code != "en"]

    if force:
        install_needed = wanted
    else:
        # look for the package archive of each language on disk
        packages_root = Path(polyglot.polyglot_path) / "transliteration2"
        install_needed = [
            lang for lang in wanted
            if not (packages_root / lang / f"transliteration.{lang}.tar.bz2").exists()
        ]

    # nothing missing, nothing to do
    if not install_needed:
        return

    print(f"Installing transliteration models for the following languages: {', '.join(install_needed)}")

    # imported lazily, only when a download is actually required
    from polyglot.downloader import Downloader
    downloader = Downloader()

    for lang in install_needed:
        downloader.download(f"transliteration2.{lang}")

View file

@ -1,3 +1,124 @@
import string
from argostranslate import translate
from polyglot.detect.base import Detector, UnknownLanguage
from polyglot.transliteration.base import Transliterator
languages = translate.load_installed_languages()
__lang_codes = [l.code for l in languages]
def detect_languages(text):
    """Detect the language(s) of ``text`` using polyglot's ``Detector``.

    ``text`` may be a single string or a list of strings (batch mode).
    Only candidates whose code appears in the module-level ``__lang_codes``
    list and that have a non-zero ``read_bytes`` are reported.

    Returns a list of ``{'confidence': ..., 'language': ...}`` dicts sorted
    in descending order of ``confidence * read_bytes / total read bytes``.
    If no usable candidate remains, a single entry for ``"en"`` with a
    confidence of ``0.0`` is returned.
    """
    # a list means batch processing, anything else is one single text
    is_batch = isinstance(text, list)
    texts = text if is_batch else [text]

    # gather the candidates of every text; a text for which the detector
    # raises UnknownLanguage simply contributes nothing
    candidates = []
    for snippet in texts:
        try:
            detected = Detector(snippet).languages
        except UnknownLanguage:
            continue
        candidates.extend(detected)

    # total read bytes over *all* candidates, including unsupported ones
    read_bytes_total = sum(c.read_bytes for c in candidates)

    # keep only the candidates that argostranslate has a language for
    supported = [
        c for c in candidates
        if c.read_bytes != 0 and c.code in __lang_codes
    ]

    # nothing detected: fall back to "en" with zero confidence
    if not supported:
        return [{'confidence': 0.0, 'language': "en"}]

    if is_batch:
        # the same language can show up once per text; merge those entries
        # into the first one (average confidence, summed read bytes)
        merged = []
        for lang_code in __lang_codes:
            same_lang = [c for c in supported if c.code == lang_code]
            if not same_lang:
                continue
            first = same_lang[0]
            if len(same_lang) > 1:
                first.confidence = sum(c.confidence for c in same_lang) / len(same_lang)
                first.read_bytes = sum(c.read_bytes for c in same_lang)
            merged.append(first)

        if merged:
            supported = merged

    def weight(candidate):
        # confidence scaled by the share of the input the candidate covers
        return (candidate.confidence * candidate.read_bytes) / read_bytes_total

    ranked = sorted(supported, key=weight, reverse=True)

    return [
        {'confidence': c.confidence, 'language': c.code}
        for c in ranked
    ]
def __transliterate_line(transliterator, line_text):
    """Transliterate a single line of text word by word.

    The line is split on single spaces. For every word the leading and
    trailing punctuation (``string.punctuation``) is set aside, the
    remaining core is passed to ``transliterator.transliterate`` and the
    punctuation is put back exactly as it was: same characters, same order,
    same repetitions.

    :param transliterator: object with a ``transliterate(word)`` method
    :param line_text: the line to process (must not contain line breaks
        that should be preserved; the text is only split on spaces)
    :return: the rebuilt line; words whose transliteration comes back empty
        are kept unchanged
    """
    punctuation = string.punctuation
    new_text = []

    # transliteration is done word by word
    for orig_word in line_text.split(" "):
        core = orig_word.strip(punctuation)

        # empty chunks (from consecutive spaces) and punctuation-only
        # chunks have nothing to transliterate
        if not core:
            new_text.append(orig_word)
            continue

        # Slice the stripped punctuation out of the original word instead of
        # rebuilding it from a set of characters: a set loses order and
        # repetitions ("..." -> ".") and misses marks that also occur inside
        # the word (the outer quotes of "'don't'").
        prefix = orig_word[:len(orig_word) - len(orig_word.lstrip(punctuation))]
        suffix = orig_word[len(orig_word.rstrip(punctuation)):]

        # the actual transliteration of the word
        t_word = transliterator.transliterate(core)

        if t_word:
            # add back the stripped punctuation
            new_text.append(prefix + t_word + suffix)
        else:
            # if transliteration fails, default back to the original word
            new_text.append(orig_word)

    # rebuild the text
    return " ".join(new_text)
def transliterate(text, target_lang="en"):
    """Transliterate ``text`` line by line.

    A polyglot ``Transliterator`` is created with the given ``target_lang``
    and applied to every line of ``text`` separately; the line breaks of the
    input are kept in the result.

    :param text: the text to process, possibly spanning several lines
    :param target_lang: language code handed to the ``Transliterator``
    :return: the processed text
    """
    # initialize the transliterator from polyglot
    transliterator = Transliterator(target_lang=target_lang)

    # Splitting a text without "\n" yields exactly one line, so single-line
    # and multi-line input can share the same path.
    return "\n".join(
        __transliterate_line(transliterator, line)
        for line in text.split("\n")
    )

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python
from app.init import check_and_install_models
from app.init import check_and_install_models, check_and_install_transliteration
if __name__ == "__main__":
check_and_install_models(force=True)
check_and_install_transliteration(force=True)

View file

@ -4,5 +4,9 @@ flask-swagger==0.2.14
flask-swagger-ui==3.36.0
Flask-Limiter==1.4
waitress==1.4.4
langdetect==1.0.8
expiringdict==1.2.1
pyicu==2.6
pycld2==0.41
morfessor==2.0.6
polyglot==16.7.4
appdirs==1.4.4