Merge pull request #526 from pierotofy/langdetect

Use lingua for language detection
This commit is contained in:
Piero Toffanin 2023-10-30 13:14:04 -04:00 committed by GitHub
commit f9712c800c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 52 additions and 87 deletions

View file

@ -1 +1 @@
1.4.1
1.5.0

View file

@ -644,6 +644,7 @@ def create_app(args):
}
)
except Exception as e:
raise e
abort(500, description=_("Cannot translate text: %(text)s", text=str(e)))
@bp.post("/translate_file")

View file

@ -1,83 +1,43 @@
# Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py
from functools import lru_cache
import unicodedata
import linguars
from lexilang.detector import detect as lldetect
import pycld2 as cld2
class UnknownLanguageError(Exception):
    """Raised when the language of a text cannot be determined reliably."""
# Value object for one detection result: a language code plus a confidence score.
# NOTE(review): this span is a rendered diff, so lines of the previous and the
# new implementation appear interleaved below.
class Language:
def __init__(self, choice):
name, code, confidence, bytesize = choice
def __init__(self, code, confidence):
self.code = code
self.name = name
# The confidence is always stored as a float, whatever numeric type was passed in.
self.confidence = float(confidence)
self.read_bytes = int(bytesize)
def __str__(self):
return ("name: {:<12}code: {:<9}confidence: {:>5.1f} "
"read bytes:{:>6}".format(self.name, self.code,
self.confidence, self.read_bytes))
return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ")
@staticmethod
def from_code(code):
return Language(("", code, 100, 0))
@lru_cache(maxsize=None)
def load_detector(langcodes=()):
    """Build a linguars detector restricted to the given ISO 639-1 codes.

    Codes for which the linguars lookup raises are reported on stdout and
    left out. Results are memoized per ``langcodes`` value by ``lru_cache``,
    so the argument has to be hashable (for example a tuple).
    """
    supported = []
    for iso_code in langcodes:
        try:
            language = linguars.Language.from_iso_code_639_1(iso_code)
        except Exception:
            # Not supported: report it and move on to the next code.
            print(f"{iso_code} is not supported by lingua")
            continue
        supported.append(language)
    return linguars.LanguageDetector(languages=supported)
# Language detector.
# NOTE(review): this span is a rendered diff, so the previous pycld2-based
# implementation and the new linguars/lexilang-based one appear interleaved below.
class Detector:
""" Detect the language used in a snippet of text."""
def __init__(self, text, quiet=False):
""" Detector of the language used in `text`.
Args:
text (string): unicode string.
"""
self.__text = text
self.reliable = True
"""False if the detector used Best Effort strategy in detection."""
self.quiet = quiet
"""If true, exceptions will be silenced."""
self.detect(text)
@staticmethod
def supported_languages():
"""Returns a list of the languages that can be detected by pycld2."""
return [name.capitalize() for name,code in cld2.LANGUAGES if not name.startswith("X_")]
# Takes the language codes to consider; the detector for a given `langcodes`
# value is built by load_detector, which is wrapped in lru_cache.
def __init__(self, langcodes = ()):
self.langcodes = langcodes
self.detector = load_detector(langcodes)
def detect(self, text):
"""Decide which language is used to write the text.
The method tries first to detect the language with high reliability. If
that is not possible, the method switches to best effort strategy.
Args:
text (string): A snippet of text, the longer it is the more reliable we
can detect the language used to write the text.
"""
try:
reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
except cld2.error as e:
if "input contains invalid UTF-8" in str(e):
# Fix for https://github.com/LibreTranslate/LibreTranslate/issues/514
# related to https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790
text = ''.join([l for l in text if unicodedata.category(str(l))[0] not in ('S', 'M', 'C')])
reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
else:
raise e
# Inputs shorter than 18 characters are tried with lexilang first; its result
# is returned only when it reports a confidence above zero.
if len(text) < 18:
code, conf = lldetect(text, self.langcodes)
if conf > 0:
return [Language(code, round(conf * 100))]
if not reliable:
self.reliable = False
reliable, index, top_3_choices = cld2.detect(text, bestEffort=True)
# Keep at most the first three (language, confidence) entries reported by the
# linguars detector.
top_3_choices = self.detector.confidence(text)[:3]
# A zero confidence on the first entry is treated as "nothing detected":
# default to English with confidence 0.
if top_3_choices[0][1] == 0:
return [Language("en", 0)]
# Confidences are scaled by 100 and rounded before being wrapped in Language objects.
return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices]
if not self.quiet and not reliable:
raise UnknownLanguageError("Try passing a longer snippet of text")
self.languages = [Language(x) for x in top_3_choices]
self.language = self.languages[0]
return self.language
def __str__(self):
text = f"Prediction is reliable: {self.reliable}\n"
text += "\n".join([f"Language {i+1}: {str(l)}"
for i,l in enumerate(self.languages)])
return text

View file

@ -1,7 +1,9 @@
from functools import lru_cache
from argostranslate import translate
from libretranslate.detect import Detector, UnknownLanguageError
from libretranslate.detect import Detector
__languages = None
@ -13,6 +15,11 @@ def load_languages():
return __languages
@lru_cache(maxsize=None)
def load_lang_codes():
    """Return the codes of all loaded languages as a tuple.

    The result is memoized by ``lru_cache``, so the language list is only
    walked on the first call.
    """
    codes = [language.code for language in load_languages()]
    return tuple(codes)
def detect_languages(text):
# detect batch processing
if isinstance(text, list):
@ -21,31 +28,24 @@ def detect_languages(text):
is_batch = False
text = [text]
lang_codes = load_lang_codes()
# get the candidates
candidates = []
for t in text:
try:
d = Detector(t).languages
d = Detector(lang_codes).detect(t)
for i in range(len(d)):
d[i].text_length = len(t)
candidates.extend(d)
except UnknownLanguageError:
pass
except Exception as e:
print(str(e))
# total read bytes of the provided text
text_length_total = sum(c.text_length for c in candidates)
# Load language codes
languages = load_languages()
lang_codes = [l.code for l in languages]
# only use candidates that are supported by argostranslate
candidate_langs = list(
filter(lambda l: l.text_length != 0 and l.code in lang_codes, candidates)
)
# this happens if no language could be detected
if not candidate_langs:
if not candidates:
# use language "en" by default but with zero confidence
return [{"confidence": 0.0, "language": "en"}]
@ -55,7 +55,7 @@ def detect_languages(text):
temp_average_list = []
for lang_code in lang_codes:
# get all candidates for a specific language
lc = list(filter(lambda l: l.code == lang_code, candidate_langs))
lc = list(filter(lambda l: l.code == lang_code, candidates))
if len(lc) > 1:
# if more than one is present, calculate the average confidence
lang = lc[0]
@ -68,14 +68,14 @@ def detect_languages(text):
if temp_average_list:
# replace the list
candidate_langs = temp_average_list
candidates = temp_average_list
# sort the candidates descending based on the detected confidence
candidate_langs.sort(
candidates.sort(
key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True
)
return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs]
return [{"confidence": l.confidence, "language": l.code} for l in candidates]
def improve_translation_formatting(source, translation, improve_punctuation=True):
@ -107,6 +107,9 @@ def improve_translation_formatting(source, translation, improve_punctuation=True
if source.isupper():
return translation.upper()
if len(translation) == 0:
return source
if source[0].islower():
return translation[0].lower() + translation[1:]

View file

@ -42,7 +42,8 @@ dependencies = [
"Flask-Session ==0.4.0",
"waitress ==2.1.2",
"expiringdict ==1.2.2",
" LTpycld2==0.42",
"linguars==0.4.0",
"lexilang==1.0.1",
"morfessor ==2.0.6",
"appdirs ==1.4.4",
"APScheduler ==3.9.1",