LibreTranslate/libretranslate/detect.py

46 lines
1.2 KiB
Python
Raw Normal View History

2023-10-30 04:03:00 +00:00
from functools import lru_cache
2022-12-11 05:45:14 +00:00
2023-10-30 04:20:11 +00:00
import linguars
from lexilang.detector import detect as lldetect
2023-10-30 04:20:11 +00:00
2023-07-09 10:29:11 +00:00
class Language:
    """A language code together with the confidence assigned to it.

    Attributes are plain instance fields:
      code       -- the language code exactly as passed in.
      confidence -- the confidence, always stored as a ``float``.
    """

    def __init__(self, code, confidence):
        """Store ``code`` as given and coerce ``confidence`` with ``float()``."""
        self.code = code
        self.confidence = float(confidence)

    def __str__(self):
        """Return a fixed-width, human-readable line for this result.

        The code is left-aligned in 9 columns and the confidence is
        right-aligned in 5 columns with one decimal place; the line ends
        with a single trailing space.
        """
        code_column = format(self.code, "<9")
        confidence_column = format(self.confidence, ">5.1f")
        return "code: " + code_column + " confidence: " + confidence_column + " "
2023-10-30 04:03:00 +00:00
@lru_cache(maxsize=None)
def load_detector(langcodes = ()):
    """Build a ``linguars.LanguageDetector`` for the given language codes.

    The result is memoised by ``lru_cache`` with no size limit, so
    ``langcodes`` must be hashable (e.g. a tuple) and repeated calls with an
    equal argument return the same detector object.

    Each code is resolved with ``linguars.Language.from_iso_code_639_1``.
    Codes for which that lookup raises are reported on stdout and left out.
    The code ``'zt'`` is skipped without attempting a lookup.
    NOTE(review): presumably 'zt' is a project-specific code rather than an
    ISO 639-1 code -- confirm.
    """
    supported = []
    for lc in langcodes:
        if lc != 'zt':
            try:
                resolved = linguars.Language.from_iso_code_639_1(lc)
            except Exception:
                # Best effort: report the code and carry on with the rest.
                print(f"{lc} is not supported by lingua")
            else:
                supported.append(resolved)

    return linguars.LanguageDetector(languages=supported)
2022-12-11 05:45:14 +00:00
2023-07-09 10:29:11 +00:00
class Detector:
    """Detect the language of a text among a fixed set of language codes.

    Short texts are first handed to lexilang's ``detect`` (imported here as
    ``lldetect``). Longer texts, and short texts for which lexilang reports
    no confidence, are scored by the detector obtained from
    ``load_detector``.
    """

    # Texts with fewer characters than this are tried with lexilang first.
    SHORT_TEXT_LENGTH = 18

    def __init__(self, langcodes = ()):
        """Remember the candidate codes and obtain the matching detector.

        ``langcodes`` may be any iterable of language codes. It is converted
        to a tuple because ``load_detector`` is wrapped in ``lru_cache``,
        which hashes its argument; passing a list straight through would
        raise ``TypeError``. A tuple argument is stored unchanged.
        """
        codes = tuple(langcodes)
        self.langcodes = codes
        self.detector = load_detector(codes)

    def detect(self, text):
        """Return a list of ``Language`` results for ``text``.

        Confidences are expressed on a 0-100 scale: the raw values returned
        by the detectors are multiplied by 100 and rounded.

        * For texts shorter than ``SHORT_TEXT_LENGTH`` characters, lexilang
          is consulted first; if it reports a confidence above zero, a
          single-element list with its answer is returned.
        * Otherwise up to three candidates from ``self.detector.confidence``
          are returned, in the order the detector supplies them.
        * If the detector yields no candidates, or the first candidate has a
          confidence of zero, ``[Language("en", 0)]`` is returned as the
          fallback.
        """
        if len(text) < self.SHORT_TEXT_LENGTH:
            code, conf = lldetect(text, self.langcodes)
            if conf > 0:
                return [Language(code, round(conf * 100))]

        top_3_choices = self.detector.confidence(text)[:3]
        # Check for an empty result before indexing so that a detector with
        # nothing to report falls back to the default instead of raising
        # IndexError.
        # NOTE(review): the first entry is presumably the most confident
        # candidate -- confirm the ordering guaranteed by linguars.
        if not top_3_choices or top_3_choices[0][1] == 0:
            return [Language("en", 0)]
        return [
            Language(lang.iso_code_639_1, round(conf * 100))
            for lang, conf in top_3_choices
        ]