Mirror of https://github.com/LibreTranslate/LibreTranslate.git (synced 2024-11-21 23:41:01 +00:00)

Merge pull request #526 from pierotofy/langdetect

Use lingua for language detection

Commit f9712c800c
5 changed files with 52 additions and 87 deletions
VERSION (+1 -1)

@@ -1 +1 @@
-1.4.1
+1.5.0
libretranslate/app.py

@@ -644,6 +644,7 @@ def create_app(args):
                 }
             )
         except Exception as e:
+            raise e
             abort(500, description=_("Cannot translate text: %(text)s", text=str(e)))

     @bp.post("/translate_file")
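One thing to note about the added line: in Python, any statement after a re-raise in the same block is unreachable, so the abort() call below it no longer runs and the original exception propagates to Flask's error handling instead. A minimal sketch (not LibreTranslate code):

def handler():
    try:
        raise ValueError("boom")
    except Exception as e:
        raise e
        print("unreachable")  # nothing after the raise in this block executes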
libretranslate/detect.py

@@ -1,83 +1,43 @@
-# Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py
-
-import unicodedata
-
-import pycld2 as cld2
-
-
-class UnknownLanguageError(Exception):
-    pass
+from functools import lru_cache
+
+import linguars
+from lexilang.detector import detect as lldetect
 
 
 class Language:
-    def __init__(self, choice):
-        name, code, confidence, bytesize = choice
+    def __init__(self, code, confidence):
         self.code = code
-        self.name = name
         self.confidence = float(confidence)
-        self.read_bytes = int(bytesize)
 
     def __str__(self):
-        return ("name: {:<12}code: {:<9}confidence: {:>5.1f} "
-                "read bytes:{:>6}".format(self.name, self.code,
-                                          self.confidence, self.read_bytes))
-
-    @staticmethod
-    def from_code(code):
-        return Language(("", code, 100, 0))
+        return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ")
+
+
+@lru_cache(maxsize=None)
+def load_detector(langcodes = ()):
+    languages = []
+    for lc in langcodes:
+        try:
+            languages.append(linguars.Language.from_iso_code_639_1(lc))
+        except Exception:
+            print(f"{lc} is not supported by lingua")
+            pass # Not supported
+
+    return linguars.LanguageDetector(languages=languages)
 
 
 class Detector:
-    """ Detect the language used in a snippet of text."""
-
-    def __init__(self, text, quiet=False):
-        """ Detector of the language used in `text`.
-
-        Args:
-            text (string): unicode string.
-        """
-        self.__text = text
-        self.reliable = True
-        """False if the detector used Best Effort strategy in detection."""
-        self.quiet = quiet
-        """If true, exceptions will be silenced."""
-        self.detect(text)
-
-    @staticmethod
-    def supported_languages():
-        """Returns a list of the languages that can be detected by pycld2."""
-        return [name.capitalize() for name,code in cld2.LANGUAGES if not name.startswith("X_")]
+    def __init__(self, langcodes = ()):
+        self.langcodes = langcodes
+        self.detector = load_detector(langcodes)
 
     def detect(self, text):
-        """Decide which language is used to write the text.
-        The method tries first to detect the language with high reliability. If
-        that is not possible, the method switches to best effort strategy.
-        Args:
-            text (string): A snippet of text, the longer it is the more reliable we
-            can detect the language used to write the text.
-        """
-        try:
-            reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
-        except cld2.error as e:
-            if "input contains invalid UTF-8" in str(e):
-                # Fix for https://github.com/LibreTranslate/LibreTranslate/issues/514
-                # related to https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790
-                text = ''.join([l for l in text if unicodedata.category(str(l))[0] not in ('S', 'M', 'C')])
-                reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
-            else:
-                raise e
-
-        if not reliable:
-            self.reliable = False
-            reliable, index, top_3_choices = cld2.detect(text, bestEffort=True)
-
-            if not self.quiet and not reliable:
-                raise UnknownLanguageError("Try passing a longer snippet of text")
-
-        self.languages = [Language(x) for x in top_3_choices]
-        self.language = self.languages[0]
-        return self.language
-
-    def __str__(self):
-        text = f"Prediction is reliable: {self.reliable}\n"
-        text += "\n".join([f"Language {i+1}: {str(l)}"
-                           for i,l in enumerate(self.languages)])
-        return text
+        if len(text) < 18:
+            code, conf = lldetect(text, self.langcodes)
+            if conf > 0:
+                return [Language(code, round(conf * 100))]
+
+        top_3_choices = self.detector.confidence(text)[:3]
+        if top_3_choices[0][1] == 0:
+            return [Language("en", 0)]
+        return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices]
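The new detect.py amounts to a two-stage cascade: strings shorter than 18 characters are first tried with lexilang (a dictionary-based detector aimed at short snippets), and everything else falls through to lingua via the linguars bindings. A minimal usage sketch, assuming both packages are installed; the language codes are example values, not a fixed set:

from libretranslate.detect import Detector

detector = Detector(("en", "es", "fr", "de"))  # example ISO 639-1 codes

for sample in ("hola", "The quick brown fox jumps over the lazy dog"):
    # detect() returns a list of Language objects, best match first
    best = detector.detect(sample)[0]
    print(sample, "->", best)  # prints code and confidence via __str__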
libretranslate/language.py

@@ -1,7 +1,9 @@
+from functools import lru_cache
+
 from argostranslate import translate
 
-from libretranslate.detect import Detector, UnknownLanguageError
+from libretranslate.detect import Detector
 
 __languages = None
 
@@ -13,6 +15,11 @@ def load_languages():
 
     return __languages
 
+@lru_cache(maxsize=None)
+def load_lang_codes():
+    languages = load_languages()
+    return tuple(l.code for l in languages)
+
 def detect_languages(text):
     # detect batch processing
     if isinstance(text, list):
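One detail in the hunk above: load_lang_codes() returns a tuple rather than a list. Tuples are hashable, so the result can be passed straight into detect.py's lru_cache-wrapped load_detector(), whose arguments must be hashable for caching. A small sketch of that constraint, with placeholder codes and a stand-in body:

from functools import lru_cache

@lru_cache(maxsize=None)
def load_detector_sketch(langcodes=()):
    return object()  # stand-in for building a linguars.LanguageDetector

codes = ("en", "es", "fr")  # tuple: hashable, so the call below is cacheable
assert load_detector_sketch(codes) is load_detector_sketch(codes)
# load_detector_sketch(["en", "es"])  # TypeError: unhashable type: 'list'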
@@ -21,31 +28,24 @@ def detect_languages(text):
         is_batch = False
         text = [text]
 
+    lang_codes = load_lang_codes()
+
     # get the candidates
     candidates = []
     for t in text:
         try:
-            d = Detector(t).languages
+            d = Detector(lang_codes).detect(t)
             for i in range(len(d)):
                 d[i].text_length = len(t)
             candidates.extend(d)
-        except UnknownLanguageError:
-            pass
+        except Exception as e:
+            print(str(e))
 
     # total read bytes of the provided text
     text_length_total = sum(c.text_length for c in candidates)
 
-    # Load language codes
-    languages = load_languages()
-    lang_codes = [l.code for l in languages]
-
-    # only use candidates that are supported by argostranslate
-    candidate_langs = list(
-        filter(lambda l: l.text_length != 0 and l.code in lang_codes, candidates)
-    )
-
     # this happens if no language could be detected
-    if not candidate_langs:
+    if not candidates:
         # use language "en" by default but with zero confidence
         return [{"confidence": 0.0, "language": "en"}]
 
@@ -55,7 +55,7 @@
     temp_average_list = []
     for lang_code in lang_codes:
         # get all candidates for a specific language
-        lc = list(filter(lambda l: l.code == lang_code, candidate_langs))
+        lc = list(filter(lambda l: l.code == lang_code, candidates))
         if len(lc) > 1:
             # if more than one is present, calculate the average confidence
             lang = lc[0]
 
@@ -68,14 +68,14 @@ def detect_languages(text):
 
     if temp_average_list:
         # replace the list
-        candidate_langs = temp_average_list
+        candidates = temp_average_list
 
     # sort the candidates descending based on the detected confidence
-    candidate_langs.sort(
+    candidates.sort(
         key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True
     )
 
-    return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs]
+    return [{"confidence": l.confidence, "language": l.code} for l in candidates]
 
 
 def improve_translation_formatting(source, translation, improve_punctuation=True):
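The sort key in the hunk above weights each candidate's confidence by the share of the batch text it was detected on, so a language found in a long input outranks one found only in a short fragment. A worked sketch with made-up numbers:

# (code, confidence, text_length) triples for a two-text batch
candidates = [("en", 90.0, 40), ("fr", 80.0, 10)]
text_length_total = sum(length for _, _, length in candidates)  # 50

ranked = sorted(
    candidates,
    key=lambda c: (c[1] * c[2]) / text_length_total,  # en: 72.0, fr: 16.0
    reverse=True,
)
print([c[0] for c in ranked])  # ['en', 'fr']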
@@ -107,6 +107,9 @@ def improve_translation_formatting(source, translation, improve_punctuation=True):
     if source.isupper():
         return translation.upper()
 
+    if len(translation) == 0:
+        return source
+
     if source[0].islower():
         return translation[0].lower() + translation[1:]
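The guard added above matters because the very next check indexes translation[0], which raises IndexError when the translation is empty; with the guard in place, an empty translation falls back to the source text instead. A hypothetical call:

from libretranslate.language import improve_translation_formatting

print(improve_translation_formatting("hello", ""))  # -> "hello"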
pyproject.toml

@@ -42,7 +42,8 @@ dependencies = [
     "Flask-Session ==0.4.0",
     "waitress ==2.1.2",
     "expiringdict ==1.2.2",
-    "LTpycld2 ==0.42",
+    "linguars==0.4.0",
+    "lexilang==1.0.1",
     "morfessor ==2.0.6",
     "appdirs ==1.4.4",
     "APScheduler ==3.9.1",
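A quick smoke test for the two swapped-in dependencies, using only the calls that detect.py itself relies on (a sketch, not part of the change; the sample strings and codes are arbitrary):

import linguars
from lexilang.detector import detect as lldetect

langs = [linguars.Language.from_iso_code_639_1(c) for c in ("en", "fr")]
detector = linguars.LanguageDetector(languages=langs)

print(detector.confidence("Bonjour tout le monde")[:3])  # lingua: (language, confidence) pairs
print(lldetect("bonjour", ("en", "fr")))                 # lexilang: (code, confidence)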