mirror of
https://github.com/LibreTranslate/LibreTranslate.git
synced 2024-11-25 17:31:00 +00:00
added transliteration before actual translation
-> e.g. if the source language is Russian, argostranslate expects Cyrillic text
This commit is contained in:
parent
36fee9bf1b
commit
25fb3b3c21
2 changed files with 54 additions and 3 deletions
|
@ -4,7 +4,7 @@ from flask_swagger import swagger
|
||||||
from flask_swagger_ui import get_swaggerui_blueprint
|
from flask_swagger_ui import get_swaggerui_blueprint
|
||||||
from pkg_resources import resource_filename
|
from pkg_resources import resource_filename
|
||||||
from .api_keys import Database
|
from .api_keys import Database
|
||||||
from app.language import detect_languages
|
from app.language import detect_languages, transliterate
|
||||||
|
|
||||||
api_keys_db = None
|
api_keys_db = None
|
||||||
|
|
||||||
|
@ -285,9 +285,9 @@ def create_app(args):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if batch:
|
if batch:
|
||||||
return jsonify({"translatedText": [translator.translate(text) for text in q] })
|
return jsonify({"translatedText": [translator.translate(transliterate(text, target_lang=source_lang)) for text in q] })
|
||||||
else:
|
else:
|
||||||
return jsonify({"translatedText": translator.translate(q) })
|
return jsonify({"translatedText": translator.translate(transliterate(q, target_lang=source_lang)) })
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
abort(500, description="Cannot translate text: %s" % str(e))
|
abort(500, description="Cannot translate text: %s" % str(e))
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
|
import string
|
||||||
|
|
||||||
from argostranslate import translate
|
from argostranslate import translate
|
||||||
from polyglot.detect.base import Detector
|
from polyglot.detect.base import Detector
|
||||||
|
from polyglot.transliteration.base import Transliterator
|
||||||
|
|
||||||
|
|
||||||
languages = translate.load_installed_languages()
|
languages = translate.load_installed_languages()
|
||||||
|
@ -68,3 +71,51 @@ def detect_languages(text):
|
||||||
}
|
}
|
||||||
for l in candidate_langs
|
for l in candidate_langs
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def __transliterate_line(transliterator, line_text):
    """Transliterate a single line word by word, preserving punctuation.

    The line is split on single spaces. For every word, leading and
    trailing ``string.punctuation`` characters are set aside, the remaining
    core is passed to ``transliterator.transliterate``, and the exact
    punctuation prefix and suffix are put back around the result.

    Args:
        transliterator: object exposing ``transliterate(word)`` that returns
            the transliterated word (a falsy value is treated as failure).
        line_text: the line to process; must not contain the line
            separator handling, which is the caller's job.

    Returns:
        The rebuilt line, with words joined by single spaces. Because the
        split is on ``" "``, runs of spaces in the input are preserved.
    """
    new_text = []

    # transliteration is done word by word
    for orig_word in line_text.split(" "):
        # the word without any punctuation on either side
        core = orig_word.strip(string.punctuation)

        if not core:
            # empty token (from consecutive spaces) or punctuation only:
            # there is nothing to transliterate, keep it untouched
            new_text.append(orig_word)
            continue

        # Recover the stripped punctuation by position rather than by set
        # difference, so order, repeated characters ("!!") and characters
        # that also occur inside the word ("a.b.") are all kept intact.
        prefix_len = len(orig_word) - len(orig_word.lstrip(string.punctuation))
        prefix = orig_word[:prefix_len]
        suffix = orig_word[prefix_len + len(core):]

        # the actual transliteration of the word
        t_word = transliterator.transliterate(core)

        if t_word:
            # add back the stripped punctuation
            new_text.append(prefix + t_word + suffix)
        else:
            # if transliteration fails, default back to the original word
            new_text.append(orig_word)

    # rebuild the text
    return " ".join(new_text)
|
||||||
|
|
||||||
|
|
||||||
|
def transliterate(text, target_lang="en"):
    """Transliterate ``text`` with a polyglot ``Transliterator``.

    A ``Transliterator`` is created for ``target_lang`` and applied to the
    text through ``__transliterate_line``. Text containing newlines is
    processed one line at a time and joined back with ``"\\n"``.

    Args:
        text: the string to transliterate; may span multiple lines.
        target_lang: language code handed to ``Transliterator`` as
            ``target_lang`` (defaults to ``"en"``).

    Returns:
        The transliterated text with its original line structure.
    """
    # set up polyglot's transliterator once for the whole text
    engine = Transliterator(target_lang=target_lang)

    # single-line input: no splitting needed
    if "\n" not in text:
        return __transliterate_line(engine, text)

    # multi-line input: handle every line on its own, then stitch them
    # back together
    converted = [__transliterate_line(engine, row) for row in text.split("\n")]
    return "\n".join(converted)
|
||||||
|
|
Loading…
Reference in a new issue