create new lingua-based detector that gets used instead of the old one

2024-06-10 09:29:27 +00:00 · 2023-01-17 20:21:37 +01:00 · 2023-01-17 20:21:37 +01:00 · 596b8a9f68
parent df6e63fbe2
commit 596b8a9f68
2 changed files with 50 additions and 4 deletions
--- a/libretranslate/detect.py
+++ b/libretranslate/detect.py
@ -3,7 +3,7 @@
 from abc import ABC
 from collections.abc import Iterable

-import pycld2 as cld2
+import lingua

 class UnknownLanguage(Exception):
    pass
@ -77,13 +77,13 @@ class BaseDetector(ABC):
        raise NotImplementedError()


-class Detector(BaseDetector):
+class _DeprecatedCld2Detector(BaseDetector):
    """Detect the language used in a snippet of text."""

    @staticmethod
    def supported_languages() -> "list[str]":
        """Returns a list of the languages that can be detected by pycld2."""
-        return [name.capitalize() for name, code in cld2.LANGUAGES if not name.startswith("X_")]
+        import pycld2 as cld2
        return [
            name.capitalize()
            for name, code in cld2.LANGUAGES
@ -98,6 +98,7 @@ class Detector(BaseDetector):
          text (string): A snippet of text, the longer it is the more reliable we
                         can detect the language used to write the text.
        """
+        import pycld2 as cld2
        reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)

        if not reliable:
@ -109,3 +110,48 @@ class Detector(BaseDetector):
                    raise UnknownLanguage("Try passing a longer snippet of text")

        return [Language(x) for x in top_3_choices]
+
+
+class Detector(BaseDetector):
+    """Detect the language used in a snippet of text."""
+
+    @staticmethod
+    def supported_languages() -> "list[str]":
+        """Returns a list of the languages that can be detected by pycld2."""
+        return [
+            lang.iso_code_639_1.name
+            for lang in lingua.Language.all()
+        ]
+
+    def detect(self, text: str) -> "list[Language]":
+        """Decide which language is used to write the text.
+        The method tries first to detect the language with high reliability. If
+        that is not possible, the method switches to the best effort strategy.
+        Args:
+          text (string): A snippet of text, the longer it is the more reliable we
+                         can detect the language used to write the text.
+        """
+        languages = [
+            lingua.Language.from_iso_code_639_1(lingua.IsoCode639_1[lang])
+            for lang in self.allowed_languages
+        ]
+        detector = lingua.LanguageDetectorBuilder.from_languages(*languages).build()
+        confidence_values: "list[tuple[lingua.Language, float]]" = detector.compute_language_confidence_values(text)
+
+        return [
+            Language((language.name.title(), language.iso_code_639_1.name, confidence, 0))
+            for language, confidence in confidence_values
+        ]
+
+
+def test_LinguaDetector():
+    for lang in Detector.supported_languages():
+        assert len(lang) == 2 and lang.upper() == lang, lang
+    assert Detector("Neuland").language.code == "DE"
+    # https://github.com/LibreTranslate/LibreTranslate/issues/247
+    assert Detector("Tout philosophe a deux philosophies : la sienne et celle de Spinoza.").language.code == "FR"
+
+
+
+if __name__ == "__main__":
+    test_LinguaDetector()
--- a/requirements.txt
+++ b/requirements.txt
@ -7,7 +7,7 @@ Flask-Babel==2.0.0
 Flask-Session==0.4.0
 waitress==2.1.2
 expiringdict==1.2.2
-LTpycld2==0.42
+lingua-language-detector==1.3.1
 morfessor==2.0.6
 appdirs==1.4.4
 APScheduler==3.9.1