Create a new lingua-based detector that replaces the old pycld2-based one

This commit is contained in:
Joshix 2023-01-17 20:21:37 +01:00
parent df6e63fbe2
commit 596b8a9f68
2 changed files with 50 additions and 4 deletions

View file

@ -3,7 +3,7 @@
from abc import ABC
from collections.abc import Iterable
import pycld2 as cld2
import lingua
class UnknownLanguage(Exception):
    """Raised when the language of a text snippet cannot be determined."""
@ -77,13 +77,13 @@ class BaseDetector(ABC):
raise NotImplementedError()
class Detector(BaseDetector):
class _DeprecatedCld2Detector(BaseDetector):
"""Detect the language used in a snippet of text."""
@staticmethod
def supported_languages() -> "list[str]":
"""Returns a list of the languages that can be detected by pycld2."""
return [name.capitalize() for name, code in cld2.LANGUAGES if not name.startswith("X_")]
import pycld2 as cld2
return [
name.capitalize()
for name, code in cld2.LANGUAGES
@ -98,6 +98,7 @@ class Detector(BaseDetector):
text (string): A snippet of text, the longer it is the more reliable we
can detect the language used to write the text.
"""
import pycld2 as cld2
reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
if not reliable:
@ -109,3 +110,48 @@ class Detector(BaseDetector):
raise UnknownLanguage("Try passing a longer snippet of text")
return [Language(x) for x in top_3_choices]
class Detector(BaseDetector):
    """Detect the language used in a snippet of text with the lingua library."""

    @staticmethod
    def supported_languages() -> "list[str]":
        """Return the ISO 639-1 code names of all languages lingua can detect.

        Each entry is the ``name`` of the ``iso_code_639_1`` attribute of a
        member of ``lingua.Language.all()``.
        """
        return [
            lang.iso_code_639_1.name
            for lang in lingua.Language.all()
        ]

    def detect(self, text: str) -> "list[Language]":
        """Compute a confidence value for every allowed language.

        A lingua detector restricted to ``self.allowed_languages`` is built on
        each call and asked for the confidence values of ``text``. There is no
        reliability check or fallback strategy here: whatever lingua reports
        is returned.

        Args:
            text (string): A snippet of text, the longer it is the more reliable we
                can detect the language used to write the text.

        Returns:
            One ``Language`` per (language, confidence) pair reported by
            lingua, in the order lingua returned them.
        """
        # Each entry of ``self.allowed_languages`` is used as a key into
        # ``lingua.IsoCode639_1``, so it must be a member name of that enum.
        # NOTE(review): ``allowed_languages`` is defined outside this class
        # (presumably on BaseDetector) -- confirm its format there.
        languages = [
            lingua.Language.from_iso_code_639_1(lingua.IsoCode639_1[lang])
            for lang in self.allowed_languages
        ]
        detector = lingua.LanguageDetectorBuilder.from_languages(*languages).build()
        confidence_values: "list[tuple[lingua.Language, float]]" = (
            detector.compute_language_confidence_values(text)
        )
        # ``Language`` takes a 4-tuple; the trailing 0 is a placeholder.
        # NOTE(review): presumably the byte count the cld2 backend used to
        # supply -- confirm against the ``Language`` constructor.
        return [
            Language((language.name.title(), language.iso_code_639_1.name, confidence, 0))
            for language, confidence in confidence_values
        ]
def test_LinguaDetector():
    """Smoke-test the lingua-based ``Detector``.

    Checks that every supported language is reported as a two-character
    upper-case code, then that two sample snippets yield the expected
    language code.
    """
    for code in Detector.supported_languages():
        assert len(code) == 2 and code.upper() == code, code

    expectations = (
        ("Neuland", "DE"),
        # https://github.com/LibreTranslate/LibreTranslate/issues/247
        ("Tout philosophe a deux philosophies : la sienne et celle de Spinoza.", "FR"),
    )
    for snippet, expected_code in expectations:
        assert Detector(snippet).language.code == expected_code


# Allow running the smoke test directly as a script.
if __name__ == "__main__":
    test_LinguaDetector()

View file

@ -7,7 +7,7 @@ Flask-Babel==2.0.0
Flask-Session==0.4.0
waitress==2.1.2
expiringdict==1.2.2
LTpycld2==0.42
lingua-language-detector==1.3.1
morfessor==2.0.6
appdirs==1.4.4
APScheduler==3.9.1