From 02ea8ae01125246269bc3fe643711c3de5f5f7d3 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 30 Oct 2023 20:39:30 -0400 Subject: [PATCH] Switch linguars for langdetect --- libretranslate/app.py | 18 ++---------------- libretranslate/detect.py | 40 ++++++++++++++++++++-------------------- pyproject.toml | 2 +- 3 files changed, 23 insertions(+), 37 deletions(-) diff --git a/libretranslate/app.py b/libretranslate/app.py index c888861..eb4fafe 100644 --- a/libretranslate/app.py +++ b/libretranslate/app.py @@ -554,22 +554,8 @@ def create_app(args): ) if source_lang == "auto": - source_langs = [] - auto_detect_texts = q if batch else [q] - - overall_candidates = detect_languages(q) - - for text_to_check in auto_detect_texts: - if len(text_to_check) > 40: - candidate_langs = detect_languages(text_to_check) - else: - # Unable to accurately detect languages for short texts - candidate_langs = overall_candidates - source_langs.append(candidate_langs[0]) - - if args.debug: - print(text_to_check, candidate_langs) - print("Auto detected: %s" % candidate_langs[0]["language"]) + candidate_langs = detect_languages(q if batch else [q]) + source_langs = [candidate_langs[0]] else: if batch: source_langs = [ {"confidence": 100.0, "language": source_lang} for text in q] diff --git a/libretranslate/detect.py b/libretranslate/detect.py index 2fd1d6e..b13d52c 100644 --- a/libretranslate/detect.py +++ b/libretranslate/detect.py @@ -1,6 +1,9 @@ -from functools import lru_cache -import linguars +from langdetect import DetectorFactory + +DetectorFactory.seed = 0 + +from langdetect import detect_langs from lexilang.detector import detect as lldetect @@ -12,34 +15,31 @@ class Language: def __str__(self): return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ") -@lru_cache(maxsize=None) -def load_detector(langcodes = ()): - languages = [] - for lc in langcodes: - if lc == 'zt': - continue - try: - languages.append(linguars.Language.from_iso_code_639_1(lc)) - except Exception: - print(f"{lc} is not supported by lingua") - pass # Not supported - - return linguars.LanguageDetector(languages=languages) +def check_lang(langcodes, lang): + return normalized_lang_code(lang) in langcodes +def normalized_lang_code(lang): + code = lang.lang + # Handle zh-cn + if code.startswith("zh"): + code = "zh" + return code class Detector: def __init__(self, langcodes = ()): self.langcodes = langcodes - self.detector = load_detector(langcodes) def detect(self, text): - if len(text) < 18: + if len(text) < 20: code, conf = lldetect(text, self.langcodes) if conf > 0: return [Language(code, round(conf * 100))] - top_3_choices = self.detector.confidence(text)[:3] - if top_3_choices[0][1] == 0: + top_3_choices = [lang for lang in detect_langs(text) if check_lang(self.langcodes, lang)][:3] + if not len(top_3_choices): + return [Language("en", 0)] + if top_3_choices[0].prob == 0: return [Language("en", 0)] - return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices] + + return [Language(normalized_lang_code(lang), round(lang.prob * 100)) for lang in top_3_choices] diff --git a/pyproject.toml b/pyproject.toml index 3e200cf..38d80b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dependencies = [ "Flask-Session ==0.4.0", "waitress ==2.1.2", "expiringdict ==1.2.2", - "linguars==0.4.0", + "langdetect==1.0.9", "lexilang==1.0.1", "morfessor ==2.0.6", "appdirs ==1.4.4",