use both cld2 and lingua

This commit is contained in:
Joshix-1 2023-01-18 18:00:00 +00:00
parent 624f19af21
commit 0f70f9a4c4
2 changed files with 51 additions and 19 deletions

View file

@ -1,9 +1,9 @@
# Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py
from abc import ABC
from collections.abc import Iterable
import lingua
import pycld2 as cld2
class UnknownLanguage(Exception):
pass
@ -64,7 +64,7 @@ class BaseDetector(ABC):
self.languages: "list[Language]" = [
lang
for lang in self.detect(text)
if lang.code in self.allowed_languages
if lang.code.upper() in self.allowed_languages
]
self.language: "Language | None" = self.languages[0] if self.languages else None
@ -78,8 +78,8 @@ class BaseDetector(ABC):
)
return text
@staticmethod
def supported_languages() -> "list[str]":
@classmethod
def supported_languages(cls) -> "list[str]":
"""Returns a list of the languages that this Detector can detect."""
raise NotImplementedError()
@ -88,13 +88,12 @@ class BaseDetector(ABC):
raise NotImplementedError()
class _DeprecatedCld2Detector(BaseDetector):
class Cld2Detector(BaseDetector):
"""Detect the language used in a snippet of text."""
@staticmethod
def supported_languages() -> "list[str]":
@classmethod
def supported_languages(cls) -> "list[str]":
"""Returns a list of the languages that can be detected by pycld2."""
import pycld2 as cld2
return [
name.capitalize()
for name, code in cld2.LANGUAGES
@ -109,25 +108,23 @@ class _DeprecatedCld2Detector(BaseDetector):
text (string): A snippet of text, the longer it is the more reliable we
can detect the language used to write the text.
"""
import pycld2 as cld2
reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
if not reliable:
self.reliable = False
reliable, index, top_3_choices = cld2.detect(text, bestEffort=True)
reliable, index, _top_3_choices = cld2.detect(text, bestEffort=True)
if not self.quiet:
if not reliable:
raise UnknownLanguage("Try passing a longer snippet of text")
return [Language(x) for x in top_3_choices]
top_3_choices = _top_3_choices
return [Language(x) for x in top_3_choices if x[0] != "Unknown"]
class Detector(BaseDetector):
class LinguaDetector(BaseDetector):
"""Detect the language used in a snippet of text."""
@staticmethod
def supported_languages() -> "list[str]":
@classmethod
def supported_languages(cls) -> "list[str]":
"""Returns a list of the languages that can be detected by pycld2."""
return [
lang.iso_code_639_1.name
@ -154,7 +151,42 @@ class Detector(BaseDetector):
]
def test_LinguaDetector():
class Detector(BaseDetector):
    """Detect the language of a text by combining several detectors.

    Each class returned by :meth:`detectors` is run on the same text and
    the languages they report are merged into a single list ordered by
    confidence.
    """

    @classmethod
    def detectors(cls) -> "tuple[type[BaseDetector], ...]":
        """Return the detector classes whose results are combined."""
        return Cld2Detector, LinguaDetector

    @classmethod
    def supported_languages(cls) -> "list[str]":
        """Returns a list of the languages that this Detector can detect.

        The result is the union of the languages supported by every
        class in :meth:`detectors`, sorted so that the order is
        deterministic between runs.
        """
        languages: "set[str]" = set()
        for detector in cls.detectors():
            languages.update(detector.supported_languages())
        return sorted(languages)

    def detect(self, text: str) -> "list[Language]":
        """Decide which language is used to write the text.

        Args:
            text (string): The snippet of text to analyse.

        Returns:
            The languages reported by all detectors, ordered from the
            highest to the lowest confidence. A language reported by more
            than one detector appears once per detector.

        Raises:
            UnknownLanguage: If ``self.quiet`` is false and every
                detector raised ``UnknownLanguage``.
        """
        detectors = self.detectors()
        languages: "list[Language]" = []
        failed = 0
        for detector in detectors:
            try:
                # Instantiating a detector runs it on the text; its
                # ``languages`` attribute holds what it found.
                languages.extend(
                    detector(
                        text,
                        quiet=self.quiet,
                        allowed_languages=self.allowed_languages,
                    ).languages
                )
            except UnknownLanguage:
                # One detector giving up is fine as long as another
                # one still produces a result.
                failed += 1
        if not self.quiet and failed == len(detectors):
            raise UnknownLanguage("Try passing a longer snippet of text")
        languages.sort(key=lambda lang: lang.confidence, reverse=True)
        return languages
def test_Detector():
for lang in Detector.supported_languages():
assert len(lang) == 2 and lang.upper() == lang, lang
assert Detector("Neuland").language.code == "DE"
@ -162,6 +194,5 @@ def test_LinguaDetector():
assert Detector("Tout philosophe a deux philosophies : la sienne et celle de Spinoza.").language.code == "FR"
if __name__ == "__main__":
test_LinguaDetector()
test_Detector()

View file

@ -8,6 +8,7 @@ Flask-Session==0.4.0
waitress==2.1.2
expiringdict==1.2.2
lingua-language-detector==1.3.1
LTpycld2==0.42
morfessor==2.0.6
appdirs==1.4.4
APScheduler==3.9.1