Merge pull request #528 from pierotofy/ld

Use langdetect
2024-11-22 07:51:00 +00:00 · 2023-10-30 20:54:35 -04:00 · 2023-10-30 20:54:35 -04:00 · 92df4c3e6b
commit 92df4c3e6b
parent 5033f58e0d 02ea8ae011
3 changed files with 23 additions and 37 deletions
--- a/libretranslate/app.py
+++ b/libretranslate/app.py
@ -554,22 +554,8 @@ def create_app(args):
                )
        if source_lang == "auto":
-            source_langs = []
+            candidate_langs = detect_languages(q if batch else [q])
-            auto_detect_texts = q if batch else [q]
+            source_langs = [candidate_langs[0]]
            overall_candidates = detect_languages(q)
            for text_to_check in auto_detect_texts:
                if len(text_to_check) > 40:
                    candidate_langs = detect_languages(text_to_check)
                else:
                    # Unable to accurately detect languages for short texts
                    candidate_langs = overall_candidates
                source_langs.append(candidate_langs[0])
                if args.debug:
                    print(text_to_check, candidate_langs)
                    print("Auto detected: %s" % candidate_langs[0]["language"])
        else:
            if batch:
                source_langs = [ {"confidence": 100.0, "language": source_lang} for text in q]
--- a/libretranslate/detect.py
+++ b/libretranslate/detect.py
@ -1,6 +1,9 @@
 from functools import lru_cache
-import linguars
+from langdetect import DetectorFactory
 DetectorFactory.seed = 0
 from langdetect import detect_langs
 from lexilang.detector import detect as lldetect
@ -12,34 +15,31 @@ class Language:
  def __str__(self):
    """Return a one-line, column-aligned summary of this language.

    The code is left-aligned in a 9-character field and the confidence is
    right-aligned in a 5-character field with one decimal place. The
    returned string ends with a trailing space.
    """
    return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ")
-@lru_cache(maxsize=None)
+def check_lang(langcodes, lang):
-def load_detector(langcodes = ()):
+  return normalized_lang_code(lang) in langcodes
  languages = []
  for lc in langcodes:
    if lc == 'zt':
      continue
    try:
      languages.append(linguars.Language.from_iso_code_639_1(lc))
    except Exception:
      print(f"{lc} is not supported by lingua")
      pass # Not supported
  return linguars.LanguageDetector(languages=languages)
 def normalized_lang_code(lang):
  """Return the code of ``lang`` with Chinese variants folded to "zh".

  ``lang`` must expose its language code as a string attribute ``.lang``
  (in this module the argument comes from ``detect_langs``). Any code that
  begins with "zh", such as "zh-cn", is returned as plain "zh"; every
  other code is returned unchanged.
  """
  code = lang.lang
  # Handle zh-cn: every zh-prefixed variant collapses to the bare "zh" code
  if code.startswith("zh"):
    code = "zh"
  return code
 class Detector:
  def __init__(self, langcodes = ()):
    """Store the language codes and build the underlying detector.

    langcodes: collection of language codes; kept on the instance as
    ``self.langcodes`` and passed to ``load_detector``. Defaults to an
    empty tuple.
    """
    self.langcodes = langcodes
    # NOTE(review): this change marks `load_detector` and the `linguars`
    # import as removed -- confirm this assignment is dropped as well.
    self.detector = load_detector(langcodes)
  def detect(self, text):
-    if len(text) < 18:
+    if len(text) < 20:
      code, conf = lldetect(text, self.langcodes)
      if conf > 0:
        return [Language(code, round(conf * 100))]
-    top_3_choices = self.detector.confidence(text)[:3]
+    top_3_choices = [lang for lang in detect_langs(text) if check_lang(self.langcodes, lang)][:3]
-    if top_3_choices[0][1] == 0:
+    if not len(top_3_choices):
      return [Language("en", 0)]
    if top_3_choices[0].prob == 0:
      return [Language("en", 0)]
-    return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices]
+
    return [Language(normalized_lang_code(lang), round(lang.prob * 100)) for lang in top_3_choices]
--- a/pyproject.toml
+++ b/pyproject.toml
@ -42,7 +42,7 @@ dependencies = [
    "Flask-Session ==0.4.0",
    "waitress ==2.1.2",
    "expiringdict ==1.2.2",
-    "linguars==0.4.0",
+    "langdetect==1.0.9",
    "lexilang==1.0.1",
    "morfessor ==2.0.6",
    "appdirs ==1.4.4",