From 6c5fa2a4ee5d81a3804c2bba5e168f0384906920 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 30 Oct 2023 00:03:00 -0400 Subject: [PATCH 1/7] Use lingua for language detection --- libretranslate/detect.py | 93 ++++++++++---------------------------- libretranslate/language.py | 33 +++++++------- pyproject.toml | 2 +- 3 files changed, 40 insertions(+), 88 deletions(-) diff --git a/libretranslate/detect.py b/libretranslate/detect.py index 5483935..5aee048 100644 --- a/libretranslate/detect.py +++ b/libretranslate/detect.py @@ -1,83 +1,36 @@ -# Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py - -import unicodedata - -import pycld2 as cld2 - - -class UnknownLanguageError(Exception): - pass +import linguars +from functools import lru_cache class Language: - def __init__(self, choice): - name, code, confidence, bytesize = choice + def __init__(self, code, confidence): self.code = code - self.name = name self.confidence = float(confidence) - self.read_bytes = int(bytesize) def __str__(self): - return ("name: {:<12}code: {:<9}confidence: {:>5.1f} " - "read bytes:{:>6}".format(self.name, self.code, - self.confidence, self.read_bytes)) + return ("code: {:<9} confidence: {:>5.1f} ".format( + self.code, + self.confidence)) - @staticmethod - def from_code(code): - return Language(("", code, 100, 0)) +@lru_cache(maxsize=None) +def load_detector(langcodes = ()): + languages = [] + for lc in langcodes: + try: + languages.append(linguars.Language.from_iso_code_639_1(lc)) + except: + pass # Not supported + + return linguars.LanguageDetector(languages=languages) class Detector: - """ Detect the language used in a snippet of text.""" - - def __init__(self, text, quiet=False): - """ Detector of the language used in `text`. - Args: - text (string): unicode string. - """ - self.__text = text - self.reliable = True - """False if the detector used Best Effort strategy in detection.""" - self.quiet = quiet - """If true, exceptions will be silenced.""" - self.detect(text) - - @staticmethod - def supported_languages(): - """Returns a list of the languages that can be detected by pycld2.""" - return [name.capitalize() for name,code in cld2.LANGUAGES if not name.startswith("X_")] + def __init__(self, langcodes = ()): + self.detector = load_detector(langcodes) def detect(self, text): - """Decide which language is used to write the text. - The method tries first to detect the language with high reliability. If - that is not possible, the method switches to best effort strategy. - Args: - text (string): A snippet of text, the longer it is the more reliable we - can detect the language used to write the text. - """ - try: - reliable, index, top_3_choices = cld2.detect(text, bestEffort=False) - except cld2.error as e: - if "input contains invalid UTF-8" in str(e): - # Fix for https://github.com/LibreTranslate/LibreTranslate/issues/514 - # related to https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790 - text = ''.join([l for l in text if unicodedata.category(str(l))[0] not in ('S', 'M', 'C')]) - reliable, index, top_3_choices = cld2.detect(text, bestEffort=False) - else: - raise e + top_3_choices = self.detector.confidence(text)[:3] + print(top_3_choices) + if top_3_choices[0][1] == 0: + return [Language("en", 0)] + return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices] - if not reliable: - self.reliable = False - reliable, index, top_3_choices = cld2.detect(text, bestEffort=True) - - if not self.quiet and not reliable: - raise UnknownLanguageError("Try passing a longer snippet of text") - - self.languages = [Language(x) for x in top_3_choices] - self.language = self.languages[0] - return self.language - - def __str__(self): - text = f"Prediction is reliable: {self.reliable}\n" - text += "\n".join([f"Language {i+1}: {str(l)}" - for i,l in enumerate(self.languages)]) - return text diff --git a/libretranslate/language.py b/libretranslate/language.py index 3ddfcda..6cebb95 100644 --- a/libretranslate/language.py +++ b/libretranslate/language.py @@ -1,7 +1,8 @@ from argostranslate import translate +from functools import lru_cache -from libretranslate.detect import Detector, UnknownLanguageError +from libretranslate.detect import Detector __languages = None @@ -13,6 +14,11 @@ def load_languages(): return __languages +@lru_cache(maxsize=None) +def load_lang_codes(): + languages = load_languages() + return (l.code for l in languages) + def detect_languages(text): # detect batch processing if isinstance(text, list): @@ -21,31 +27,24 @@ def detect_languages(text): is_batch = False text = [text] + lang_codes = load_lang_codes() + # get the candidates candidates = [] for t in text: try: - d = Detector(t).languages + d = Detector(lang_codes).detect(t) for i in range(len(d)): d[i].text_length = len(t) candidates.extend(d) - except UnknownLanguageError: + except: pass # total read bytes of the provided text text_length_total = sum(c.text_length for c in candidates) - # Load language codes - languages = load_languages() - lang_codes = [l.code for l in languages] - - # only use candidates that are supported by argostranslate - candidate_langs = list( - filter(lambda l: l.text_length != 0 and l.code in lang_codes, candidates) - ) - # this happens if no language could be detected - if not candidate_langs: + if not candidates: # use language "en" by default but with zero confidence return [{"confidence": 0.0, "language": "en"}] @@ -55,7 +54,7 @@ def detect_languages(text): temp_average_list = [] for lang_code in lang_codes: # get all candidates for a specific language - lc = list(filter(lambda l: l.code == lang_code, candidate_langs)) + lc = list(filter(lambda l: l.code == lang_code, candidates)) if len(lc) > 1: # if more than one is present, calculate the average confidence lang = lc[0] @@ -68,14 +67,14 @@ def detect_languages(text): if temp_average_list: # replace the list - candidate_langs = temp_average_list + candidates = temp_average_list # sort the candidates descending based on the detected confidence - candidate_langs.sort( + candidates.sort( key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True ) - return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs] + return [{"confidence": l.confidence, "language": l.code} for l in candidates] def improve_translation_formatting(source, translation, improve_punctuation=True): diff --git a/pyproject.toml b/pyproject.toml index bd79bdd..f4d3459 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dependencies = [ "Flask-Session ==0.4.0", "waitress ==2.1.2", "expiringdict ==1.2.2", - " LTpycld2==0.42", + "linguars==0.4.0", "morfessor ==2.0.6", "appdirs ==1.4.4", "APScheduler ==3.9.1", From 522b8b03d7996716de97fdd923b7f4fe686e2eab Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 30 Oct 2023 00:09:52 -0400 Subject: [PATCH 2/7] Fix some warnings --- libretranslate/detect.py | 4 ++-- libretranslate/language.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libretranslate/detect.py b/libretranslate/detect.py index 5aee048..559aea8 100644 --- a/libretranslate/detect.py +++ b/libretranslate/detect.py @@ -17,7 +17,8 @@ def load_detector(langcodes = ()): for lc in langcodes: try: languages.append(linguars.Language.from_iso_code_639_1(lc)) - except: + except Exception as e: + print(f"{lc} is not supported by lingua") pass # Not supported return linguars.LanguageDetector(languages=languages) @@ -29,7 +30,6 @@ class Detector: def detect(self, text): top_3_choices = self.detector.confidence(text)[:3] - print(top_3_choices) if top_3_choices[0][1] == 0: return [Language("en", 0)] return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices] diff --git a/libretranslate/language.py b/libretranslate/language.py index 6cebb95..cc647b1 100644 --- a/libretranslate/language.py +++ b/libretranslate/language.py @@ -37,8 +37,8 @@ def detect_languages(text): for i in range(len(d)): d[i].text_length = len(t) candidates.extend(d) - except: - pass + except Exception as e: + print(str(e)) # total read bytes of the provided text text_length_total = sum(c.text_length for c in candidates) From 3fd2abc1f9b614c292023bfb15d9cd6692473c49 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 30 Oct 2023 00:20:11 -0400 Subject: [PATCH 3/7] Run ruff --- libretranslate/detect.py | 10 +++++----- libretranslate/language.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/libretranslate/detect.py b/libretranslate/detect.py index 559aea8..b1bafa8 100644 --- a/libretranslate/detect.py +++ b/libretranslate/detect.py @@ -1,15 +1,15 @@ -import linguars from functools import lru_cache +import linguars + + class Language: def __init__(self, code, confidence): self.code = code self.confidence = float(confidence) def __str__(self): - return ("code: {:<9} confidence: {:>5.1f} ".format( - self.code, - self.confidence)) + return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ") @lru_cache(maxsize=None) def load_detector(langcodes = ()): @@ -17,7 +17,7 @@ def load_detector(langcodes = ()): for lc in langcodes: try: languages.append(linguars.Language.from_iso_code_639_1(lc)) - except Exception as e: + except Exception: print(f"{lc} is not supported by lingua") pass # Not supported diff --git a/libretranslate/language.py b/libretranslate/language.py index cc647b1..7d1ea2c 100644 --- a/libretranslate/language.py +++ b/libretranslate/language.py @@ -1,6 +1,7 @@ +from functools import lru_cache + from argostranslate import translate -from functools import lru_cache from libretranslate.detect import Detector From c9592a236a94d9a4871d89c4f639da277727c1f4 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 30 Oct 2023 00:35:40 -0400 Subject: [PATCH 4/7] Fix trailing whitespace --- libretranslate/detect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libretranslate/detect.py b/libretranslate/detect.py index b1bafa8..b4ec9b5 100644 --- a/libretranslate/detect.py +++ b/libretranslate/detect.py @@ -20,7 +20,7 @@ def load_detector(langcodes = ()): except Exception: print(f"{lc} is not supported by lingua") pass # Not supported - + return linguars.LanguageDetector(languages=languages) From 6ff5bba000f6fab3698e6b81c7fbc9dfdcff9fe2 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 30 Oct 2023 12:52:33 -0400 Subject: [PATCH 5/7] Add lexilang for language detection on short texts --- libretranslate/detect.py | 7 +++++++ libretranslate/language.py | 2 +- pyproject.toml | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/libretranslate/detect.py b/libretranslate/detect.py index b4ec9b5..a0b15b2 100644 --- a/libretranslate/detect.py +++ b/libretranslate/detect.py @@ -1,6 +1,7 @@ from functools import lru_cache import linguars +from lexilang.detector import detect as lldetect class Language: @@ -26,9 +27,15 @@ def load_detector(langcodes = ()): class Detector: def __init__(self, langcodes = ()): + self.langcodes = langcodes self.detector = load_detector(langcodes) def detect(self, text): + if len(text) < 18: + code, conf = lldetect(text, self.langcodes) + if conf > 0: + return [Language(code, round(conf * 100))] + top_3_choices = self.detector.confidence(text)[:3] if top_3_choices[0][1] == 0: return [Language("en", 0)] diff --git a/libretranslate/language.py b/libretranslate/language.py index 7d1ea2c..86921bc 100644 --- a/libretranslate/language.py +++ b/libretranslate/language.py @@ -18,7 +18,7 @@ def load_languages(): @lru_cache(maxsize=None) def load_lang_codes(): languages = load_languages() - return (l.code for l in languages) + return tuple(l.code for l in languages) def detect_languages(text): # detect batch processing diff --git a/pyproject.toml b/pyproject.toml index f4d3459..3e200cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "waitress ==2.1.2", "expiringdict ==1.2.2", "linguars==0.4.0", + "lexilang==1.0.1", "morfessor ==2.0.6", "appdirs ==1.4.4", "APScheduler ==3.9.1", From a9bff7929c660725c609cf7c63cde0100fb4bacb Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 30 Oct 2023 12:53:03 -0400 Subject: [PATCH 6/7] Bump version --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 347f583..bc80560 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.4.1 +1.5.0 From 2972292fc5f6f692911aa81c2b0245557bdd0486 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 30 Oct 2023 13:09:39 -0400 Subject: [PATCH 7/7] Fix string index out of range fault --- libretranslate/app.py | 1 + libretranslate/language.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/libretranslate/app.py b/libretranslate/app.py index a5a77a8..c888861 100644 --- a/libretranslate/app.py +++ b/libretranslate/app.py @@ -644,6 +644,7 @@ def create_app(args): } ) except Exception as e: + raise e abort(500, description=_("Cannot translate text: %(text)s", text=str(e))) @bp.post("/translate_file") diff --git a/libretranslate/language.py b/libretranslate/language.py index 86921bc..44f926f 100644 --- a/libretranslate/language.py +++ b/libretranslate/language.py @@ -107,6 +107,9 @@ def improve_translation_formatting(source, translation, improve_punctuation=True if source.isupper(): return translation.upper() + if len(translation) == 0: + return source + if source[0].islower(): return translation[0].lower() + translation[1:]