Mirror of https://github.com/LibreTranslate/LibreTranslate.git (synced 2024-11-25 09:21:04 +00:00)

Merge pull request #60 from mammo0/polyglot

Use polyglot for language detection and transliteration.

Commit fb031b826a: 6 changed files with 178 additions and 34 deletions
@@ -1,2 +1,10 @@
+import os
+
+from appdirs import user_data_dir
+
+# override polyglot path
+import polyglot
+polyglot.polyglot_path = os.path.join(user_data_dir(appname="LibreTranslate", appauthor="uav4geo"), "polyglot_data")
+
 from .main import main
 from .manage import manage
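The override above runs at package import time so that polyglot looks for (and downloads) its models under LibreTranslate's per-user data directory instead of its default location. A minimal sketch of what the redirected path looks like, assuming appdirs is installed; the printed location is platform-dependent and only illustrative:

    import os

    from appdirs import user_data_dir

    # Same expression as in the diff above; appdirs picks the per-user data
    # directory for the current platform.
    polyglot_data = os.path.join(
        user_data_dir(appname="LibreTranslate", appauthor="uav4geo"),
        "polyglot_data",
    )

    # On Linux this typically resolves to ~/.local/share/LibreTranslate/polyglot_data.
    print(polyglot_data)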
app/app.py (32 changed lines)

@@ -2,12 +2,9 @@ import os
 from flask import Flask, render_template, jsonify, request, abort, send_from_directory
 from flask_swagger import swagger
 from flask_swagger_ui import get_swaggerui_blueprint
-from langdetect import detect_langs
-from langdetect import DetectorFactory
 from pkg_resources import resource_filename
 from .api_keys import Database
-
-DetectorFactory.seed = 0  # deterministic
+from app.language import detect_languages, transliterate

 api_keys_db = None

@@ -57,11 +54,6 @@ def create_app(args):
     from app.language import languages

     app = Flask(__name__)

-    # For faster access
-    language_map = {}
-    for l in languages:
-        language_map[l.code] = l.name
-
     if args.debug:
         app.config['TEMPLATES_AUTO_RELOAD'] = True

@@ -271,19 +263,12 @@ def create_app(args):
             abort(400, description="Invalid request: Request (%d) exceeds character limit (%d)" % (chars, args.char_limit))

         if source_lang == 'auto':
-            candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q)))
+            candidate_langs = detect_languages(q)

-            if len(candidate_langs) > 0:
-                candidate_langs.sort(key=lambda l: l.prob, reverse=True)
-
             if args.debug:
                 print(candidate_langs)

-            source_lang = next(iter([l.code for l in languages if l.code == candidate_langs[0].lang]), None)
-            if not source_lang:
-                source_lang = 'en'
-            else:
-                source_lang = 'en'
+            source_lang = candidate_langs[0]["language"]

             if args.debug:
                 print("Auto detected: %s" % source_lang)
@@ -300,9 +285,9 @@ def create_app(args):

         try:
             if batch:
-                return jsonify({"translatedText": [translator.translate(text) for text in q] })
+                return jsonify({"translatedText": [translator.translate(transliterate(text, target_lang=source_lang)) for text in q] })
             else:
-                return jsonify({"translatedText": translator.translate(q) })
+                return jsonify({"translatedText": translator.translate(transliterate(q, target_lang=source_lang)) })
         except Exception as e:
             abort(500, description="Cannot translate text: %s" % str(e))

@@ -385,12 +370,7 @@ def create_app(args):
         if not q:
             abort(400, description="Invalid request: missing q parameter")

-        candidate_langs = list(filter(lambda l: l.lang in language_map, detect_langs(q)))
-        candidate_langs.sort(key=lambda l: l.prob, reverse=True)
-        return jsonify([{
-            'confidence': l.prob,
-            'language': l.lang
-        } for l in candidate_langs])
+        return jsonify(detect_languages(q))


     @app.route("/frontend/settings")
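With these changes the translate endpoint no longer uses langdetect: language guessing is delegated to detect_languages(), and the request text is passed through transliterate() (with the detected source language as the transliteration target) before it reaches the Argos translator. A condensed sketch of the new auto-detect path outside of Flask; the sample input is illustrative and the helpers assume installed Argos models and downloaded polyglot data:

    from app.language import detect_languages, transliterate

    q = "an illustrative input sentence"

    # detect_languages() returns a confidence-sorted list of dicts,
    # e.g. [{'confidence': ..., 'language': ...}, ...]
    candidate_langs = detect_languages(q)
    source_lang = candidate_langs[0]["language"]

    # The request text is transliterated toward the detected source language's
    # script before being handed to the Argos translator.
    prepared = transliterate(q, target_lang=source_lang)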
app/init.py (30 changed lines)

@@ -2,9 +2,12 @@ import os
 from pathlib import Path
 from argostranslate import settings, package, translate
 import os, glob, shutil, zipfile
+from app.language import languages
+import polyglot

 def boot():
     check_and_install_models()
+    check_and_install_transliteration()

 def check_and_install_models(force=False):
     if len(package.get_installed_packages()) < 2 or force:
@@ -22,5 +25,32 @@ def check_and_install_models(force=False):
             download_path = available_package.download()
             package.install_from_path(download_path)

+        # reload installed languages
+        global languages
+        languages = translate.load_installed_languages()
         print("Loaded support for %s languages (%s models total)!" % (len(translate.load_installed_languages()), len(available_packages)))
+
+
+def check_and_install_transliteration(force=False):
+    # 'en' is not a supported transliteration language
+    transliteration_languages = [l.code for l in languages if l.code != "en"]
+
+    # check installed
+    install_needed = []
+    if not force:
+        t_packages_path = Path(polyglot.polyglot_path) / "transliteration2"
+        for lang in transliteration_languages:
+            if not (t_packages_path / lang / f"transliteration.{lang}.tar.bz2").exists():
+                install_needed.append(lang)
+    else:
+        install_needed = transliteration_languages
+
+    # install the needed transliteration packages
+    if install_needed:
+        print(f"Installing transliteration models for the following languages: {', '.join(install_needed)}")
+
+        from polyglot.downloader import Downloader
+        downloader = Downloader()
+        for lang in install_needed:
+            downloader.download(f"transliteration2.{lang}")

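check_and_install_transliteration() mirrors the existing model check for translation packages: for every installed language except 'en' it looks for a transliteration.<lang>.tar.bz2 archive under <polyglot_path>/transliteration2/<lang>/ and downloads whatever is missing through polyglot's Downloader. A minimal sketch of that check for a single language; the "de" code is only an example, and polyglot.polyglot_path is assumed to have been overridden as in the first diff above:

    from pathlib import Path

    import polyglot
    from polyglot.downloader import Downloader

    lang = "de"  # illustrative language code
    archive = (
        Path(polyglot.polyglot_path)
        / "transliteration2"
        / lang
        / f"transliteration.{lang}.tar.bz2"
    )

    # Download the transliteration package only if it is not there yet.
    if not archive.exists():
        Downloader().download(f"transliteration2.{lang}")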
app/language.py (121 changed lines)

@@ -1,3 +1,124 @@
+import string
+
 from argostranslate import translate
+from polyglot.detect.base import Detector, UnknownLanguage
+from polyglot.transliteration.base import Transliterator
+

 languages = translate.load_installed_languages()
+
+
+__lang_codes = [l.code for l in languages]
+
+
+def detect_languages(text):
+    # detect batch processing
+    if isinstance(text, list):
+        is_batch = True
+    else:
+        is_batch = False
+        text = [text]
+
+    # get the candidates
+    candidates = []
+    for t in text:
+        try:
+            candidates.extend(Detector(t).languages)
+        except UnknownLanguage as e:
+            pass
+
+    # total read bytes of the provided text
+    read_bytes_total = sum(c.read_bytes for c in candidates)
+
+    # only use candidates that are supported by argostranslate
+    candidate_langs = list(filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, candidates))
+
+    # this happens if no language could be detected
+    if not candidate_langs:
+        # use language "en" by default but with zero confidence
+        return [
+            {
+                'confidence': 0.0,
+                'language': "en"
+            }
+        ]
+
+    # for multiple occurrences of the same language (can happen on batch detection)
+    # calculate the average confidence for each language
+    if is_batch:
+        temp_average_list = []
+        for lang_code in __lang_codes:
+            # get all candidates for a specific language
+            lc = list(filter(lambda l: l.code == lang_code, candidate_langs))
+            if len(lc) > 1:
+                # if more than one is present, calculate the average confidence
+                lang = lc[0]
+                lang.confidence = sum(l.confidence for l in lc) / len(lc)
+                lang.read_bytes = sum(l.read_bytes for l in lc)
+                temp_average_list.append(lang)
+            elif lc:
+                # otherwise just add it to the temporary list
+                temp_average_list.append(lc[0])
+
+        if temp_average_list:
+            # replace the list
+            candidate_langs = temp_average_list
+
+    # sort the candidates descending based on the detected confidence
+    candidate_langs.sort(key=lambda l: (l.confidence * l.read_bytes) / read_bytes_total, reverse=True)
+
+    return [
+        {
+            'confidence': l.confidence,
+            'language': l.code
+        }
+        for l in candidate_langs
+    ]
+
+
+def __transliterate_line(transliterator, line_text):
+    new_text = []
+
+    # transliteration is done word by word
+    for orig_word in line_text.split(" "):
+        # remove any punctuation on the right side
+        r_word = orig_word.rstrip(string.punctuation)
+        r_diff = set(char for char in orig_word) - set(char for char in r_word)
+        # and on the left side
+        l_word = orig_word.lstrip(string.punctuation)
+        l_diff = set(char for char in orig_word) - set(char for char in l_word)
+
+        # the actual transliteration of the word
+        t_word = transliterator.transliterate(orig_word.strip(string.punctuation))
+
+        # if transliteration fails, default back to the original word
+        if not t_word:
+            t_word = orig_word
+        else:
+            # add back any stripped punctuation
+            if r_diff:
+                t_word = t_word + ''.join(r_diff)
+            if l_diff:
+                t_word = ''.join(l_diff) + t_word
+
+        new_text.append(t_word)
+
+    # rebuild the text
+    return " ".join(new_text)
+
+
+def transliterate(text, target_lang="en"):
+    # initialize the transliterator from polyglot
+    transliterator = Transliterator(target_lang=target_lang)
+
+    # check for multiline string
+    if "\n" in text:
+        lines = []
+        # process each line separate
+        for line in text.split("\n"):
+            lines.append(__transliterate_line(transliterator, line))
+
+        # rejoin multiline string
+        return "\n".join(lines)
+    else:
+        return __transliterate_line(transliterator, text)
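Taken together, the new module exposes two helpers: detect_languages() accepts a single string or a list of strings (per-language confidences are averaged for batch input) and always returns a confidence-sorted list of {'confidence', 'language'} dicts, while transliterate() works line by line and word by word, re-attaching stripped punctuation and falling back to the original word when polyglot returns nothing. A short usage sketch; the inputs and the commented return shapes are illustrative, and both helpers assume the Argos models and the polyglot data are already installed:

    from app.language import detect_languages, transliterate

    # Single string: one confidence-sorted list of candidates,
    # e.g. [{'confidence': ..., 'language': ...}, ...]
    print(detect_languages("Hola, ¿cómo estás?"))

    # Batch input: duplicate languages across the items are merged and
    # their confidences averaged.
    print(detect_languages(["Hola, ¿cómo estás?", "Buenos días"]))

    # Word-by-word transliteration; assumes a transliteration model for the
    # target language has been downloaded (which is why 'en' is skipped during
    # installation in app/init.py).
    print(transliterate("hello world", target_lang="ru"))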
@@ -1,6 +1,7 @@
 #!/usr/bin/env python

-from app.init import check_and_install_models
+from app.init import check_and_install_models, check_and_install_transliteration

 if __name__ == "__main__":
     check_and_install_models(force=True)
+    check_and_install_transliteration(force=True)
@@ -4,5 +4,9 @@ flask-swagger==0.2.14
 flask-swagger-ui==3.36.0
 Flask-Limiter==1.4
 waitress==1.4.4
-langdetect==1.0.8
 expiringdict==1.2.1
+pyicu==2.6
+pycld2==0.41
+morfessor==2.0.6
+polyglot==16.7.4
+appdirs==1.4.4