2022-12-11 05:45:14 +00:00
|
|
|
|
2023-10-31 00:39:30 +00:00
|
|
|
from langdetect import DetectorFactory
|
|
|
|
|
|
|
|
# Pin langdetect's seed before any detection runs so that repeated calls on
# the same text return the same result (langdetect's README describes the
# algorithm as non-deterministic unless the seed is fixed).
DetectorFactory.seed = 0
|
|
|
|
|
|
|
|
from langdetect import detect_langs
|
2023-10-30 16:52:33 +00:00
|
|
|
from lexilang.detector import detect as lldetect
|
2023-10-30 04:20:11 +00:00
|
|
|
|
|
|
|
|
2023-07-09 10:29:11 +00:00
|
|
|
class Language:
    """A language code paired with the confidence of its detection."""

    def __init__(self, code, confidence):
        """Keep ``code`` as given and store ``confidence`` coerced to float."""
        self.code = code
        self.confidence = float(confidence)

    def __str__(self):
        """Render the code (left-aligned, width 9) and confidence (one decimal)."""
        code, confidence = self.code, self.confidence
        return f"code: {code:<9} confidence: {confidence:>5.1f} "
|
2023-10-30 04:03:00 +00:00
|
|
|
|
2023-10-31 00:39:30 +00:00
|
|
|
def check_lang(langcodes, lang):
    """Return True when the normalized code of ``lang`` is in ``langcodes``.

    ``lang`` is passed straight to ``normalized_lang_code``, so it must
    expose a ``.lang`` attribute holding the raw language code.
    """
    normalized = normalized_lang_code(lang)
    return normalized in langcodes
|
2022-12-11 05:45:14 +00:00
|
|
|
|
2023-10-31 00:39:30 +00:00
|
|
|
def normalized_lang_code(lang):
    """Return the code stored in ``lang.lang``, folding "zh*" variants to "zh".

    Any code that starts with "zh" (for example "zh-cn") collapses to plain
    "zh"; every other code is returned unchanged.
    """
    raw_code = lang.lang
    return "zh" if raw_code.startswith("zh") else raw_code
|
2022-12-11 05:45:14 +00:00
|
|
|
|
2023-07-09 10:29:11 +00:00
|
|
|
class Detector:
    """Detect the language of a text, restricted to a set of allowed codes."""

    def __init__(self, langcodes = ()):
        """Remember the collection of language codes detection may return."""
        self.langcodes = langcodes

    def detect(self, text):
        """Return a list of ``Language`` guesses for ``text``.

        Texts shorter than 20 characters are tried with the lexilang detector
        first; its answer is used only when it reports a confidence above
        zero. Otherwise langdetect's candidates are filtered down to the
        allowed codes and at most the first three are kept. If nothing
        survives the filter, or the first survivor has zero probability, the
        result is a single ``Language("en", 0)``.

        Confidences are the detectors' scores multiplied by 100 and rounded
        (presumably the scores are in the 0..1 range -- confirm against the
        detector libraries).
        """
        if len(text) < 20:
            short_code, short_conf = lldetect(text, self.langcodes)
            if short_conf > 0:
                return [Language(short_code, round(short_conf * 100))]

        allowed = [
            guess
            for guess in detect_langs(text)
            if check_lang(self.langcodes, guess)
        ]
        best = allowed[:3]

        # No usable candidate: fall back to English with zero confidence.
        if not best or best[0].prob == 0:
            return [Language("en", 0)]

        results = []
        for guess in best:
            results.append(
                Language(normalized_lang_code(guess), round(guess.prob * 100))
            )
        return results
|
2023-07-09 10:38:03 +00:00
|
|
|
|