Merge pull request #526 from pierotofy/langdetect

Use lingua for language detection
This commit is contained in:
Piero Toffanin 2023-10-30 13:14:04 -04:00 committed by GitHub
commit f9712c800c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 52 additions and 87 deletions

View file

@@ -1 +1 @@
1.4.1 1.5.0

View file

@@ -644,6 +644,7 @@ def create_app(args):
} }
) )
except Exception as e: except Exception as e:
raise e
abort(500, description=_("Cannot translate text: %(text)s", text=str(e))) abort(500, description=_("Cannot translate text: %(text)s", text=str(e)))
@bp.post("/translate_file") @bp.post("/translate_file")

View file

@@ -1,83 +1,43 @@
# Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py from functools import lru_cache
import unicodedata import linguars
from lexilang.detector import detect as lldetect
import pycld2 as cld2
class UnknownLanguageError(Exception):
pass
class Language: class Language:
def __init__(self, choice): def __init__(self, code, confidence):
name, code, confidence, bytesize = choice
self.code = code self.code = code
self.name = name
self.confidence = float(confidence) self.confidence = float(confidence)
self.read_bytes = int(bytesize)
def __str__(self): def __str__(self):
return ("name: {:<12}code: {:<9}confidence: {:>5.1f} " return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ")
"read bytes:{:>6}".format(self.name, self.code,
self.confidence, self.read_bytes))
@staticmethod @lru_cache(maxsize=None)
def from_code(code): def load_detector(langcodes = ()):
return Language(("", code, 100, 0)) languages = []
for lc in langcodes:
try:
languages.append(linguars.Language.from_iso_code_639_1(lc))
except Exception:
print(f"{lc} is not supported by lingua")
pass # Not supported
return linguars.LanguageDetector(languages=languages)
class Detector: class Detector:
""" Detect the language used in a snippet of text.""" def __init__(self, langcodes = ()):
self.langcodes = langcodes
def __init__(self, text, quiet=False): self.detector = load_detector(langcodes)
""" Detector of the language used in `text`.
Args:
text (string): unicode string.
"""
self.__text = text
self.reliable = True
"""False if the detector used Best Effort strategy in detection."""
self.quiet = quiet
"""If true, exceptions will be silenced."""
self.detect(text)
@staticmethod
def supported_languages():
"""Returns a list of the languages that can be detected by pycld2."""
return [name.capitalize() for name,code in cld2.LANGUAGES if not name.startswith("X_")]
def detect(self, text): def detect(self, text):
"""Decide which language is used to write the text. if len(text) < 18:
The method tries first to detect the language with high reliability. If code, conf = lldetect(text, self.langcodes)
that is not possible, the method switches to best effort strategy. if conf > 0:
Args: return [Language(code, round(conf * 100))]
text (string): A snippet of text, the longer it is the more reliable we
can detect the language used to write the text.
"""
try:
reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
except cld2.error as e:
if "input contains invalid UTF-8" in str(e):
# Fix for https://github.com/LibreTranslate/LibreTranslate/issues/514
# related to https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790
text = ''.join([l for l in text if unicodedata.category(str(l))[0] not in ('S', 'M', 'C')])
reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
else:
raise e
if not reliable: top_3_choices = self.detector.confidence(text)[:3]
self.reliable = False if top_3_choices[0][1] == 0:
reliable, index, top_3_choices = cld2.detect(text, bestEffort=True) return [Language("en", 0)]
return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices]
if not self.quiet and not reliable:
raise UnknownLanguageError("Try passing a longer snippet of text")
self.languages = [Language(x) for x in top_3_choices]
self.language = self.languages[0]
return self.language
def __str__(self):
text = f"Prediction is reliable: {self.reliable}\n"
text += "\n".join([f"Language {i+1}: {str(l)}"
for i,l in enumerate(self.languages)])
return text

View file

@@ -1,7 +1,9 @@
from functools import lru_cache
from argostranslate import translate from argostranslate import translate
from libretranslate.detect import Detector, UnknownLanguageError from libretranslate.detect import Detector
__languages = None __languages = None
@@ -13,6 +15,11 @@ def load_languages():
return __languages return __languages
@lru_cache(maxsize=None)
def load_lang_codes():
languages = load_languages()
return tuple(l.code for l in languages)
def detect_languages(text): def detect_languages(text):
# detect batch processing # detect batch processing
if isinstance(text, list): if isinstance(text, list):
@@ -21,31 +28,24 @@ def detect_languages(text):
is_batch = False is_batch = False
text = [text] text = [text]
lang_codes = load_lang_codes()
# get the candidates # get the candidates
candidates = [] candidates = []
for t in text: for t in text:
try: try:
d = Detector(t).languages d = Detector(lang_codes).detect(t)
for i in range(len(d)): for i in range(len(d)):
d[i].text_length = len(t) d[i].text_length = len(t)
candidates.extend(d) candidates.extend(d)
except UnknownLanguageError: except Exception as e:
pass print(str(e))
# total read bytes of the provided text # total read bytes of the provided text
text_length_total = sum(c.text_length for c in candidates) text_length_total = sum(c.text_length for c in candidates)
# Load language codes
languages = load_languages()
lang_codes = [l.code for l in languages]
# only use candidates that are supported by argostranslate
candidate_langs = list(
filter(lambda l: l.text_length != 0 and l.code in lang_codes, candidates)
)
# this happens if no language could be detected # this happens if no language could be detected
if not candidate_langs: if not candidates:
# use language "en" by default but with zero confidence # use language "en" by default but with zero confidence
return [{"confidence": 0.0, "language": "en"}] return [{"confidence": 0.0, "language": "en"}]
@@ -55,7 +55,7 @@ def detect_languages(text):
temp_average_list = [] temp_average_list = []
for lang_code in lang_codes: for lang_code in lang_codes:
# get all candidates for a specific language # get all candidates for a specific language
lc = list(filter(lambda l: l.code == lang_code, candidate_langs)) lc = list(filter(lambda l: l.code == lang_code, candidates))
if len(lc) > 1: if len(lc) > 1:
# if more than one is present, calculate the average confidence # if more than one is present, calculate the average confidence
lang = lc[0] lang = lc[0]
@@ -68,14 +68,14 @@ def detect_languages(text):
if temp_average_list: if temp_average_list:
# replace the list # replace the list
candidate_langs = temp_average_list candidates = temp_average_list
# sort the candidates descending based on the detected confidence # sort the candidates descending based on the detected confidence
candidate_langs.sort( candidates.sort(
key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True
) )
return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs] return [{"confidence": l.confidence, "language": l.code} for l in candidates]
def improve_translation_formatting(source, translation, improve_punctuation=True): def improve_translation_formatting(source, translation, improve_punctuation=True):
@@ -107,6 +107,9 @@ def improve_translation_formatting(source, translation, improve_punctuation=True
if source.isupper(): if source.isupper():
return translation.upper() return translation.upper()
if len(translation) == 0:
return source
if source[0].islower(): if source[0].islower():
return translation[0].lower() + translation[1:] return translation[0].lower() + translation[1:]

View file

@@ -42,7 +42,8 @@ dependencies = [
"Flask-Session ==0.4.0", "Flask-Session ==0.4.0",
"waitress ==2.1.2", "waitress ==2.1.2",
"expiringdict ==1.2.2", "expiringdict ==1.2.2",
" LTpycld2==0.42", "linguars==0.4.0",
"lexilang==1.0.1",
"morfessor ==2.0.6", "morfessor ==2.0.6",
"appdirs ==1.4.4", "appdirs ==1.4.4",
"APScheduler ==3.9.1", "APScheduler ==3.9.1",