Add lexilang for language detection on short texts

This commit is contained in:
Piero Toffanin 2023-10-30 12:52:33 -04:00
parent c9592a236a
commit 6ff5bba000
3 changed files with 9 additions and 1 deletions

View file

@@ -1,6 +1,7 @@
 from functools import lru_cache
 import linguars
+from lexilang.detector import detect as lldetect
 class Language:
@@ -26,9 +27,15 @@ def load_detector(langcodes = ()):
 class Detector:
     def __init__(self, langcodes = ()):
+        self.langcodes = langcodes
         self.detector = load_detector(langcodes)
     def detect(self, text):
+        if len(text) < 18:
+            code, conf = lldetect(text, self.langcodes)
+            if conf > 0:
+                return [Language(code, round(conf * 100))]
         top_3_choices = self.detector.confidence(text)[:3]
         if top_3_choices[0][1] == 0:
             return [Language("en", 0)]

View file

@@ -18,7 +18,7 @@ def load_languages():
 @lru_cache(maxsize=None)
 def load_lang_codes():
     languages = load_languages()
-    return (l.code for l in languages)
+    return tuple(l.code for l in languages)
 def detect_languages(text):
     # detect batch processing

View file

@@ -43,6 +43,7 @@ dependencies = [
     "waitress ==2.1.2",
     "expiringdict ==1.2.2",
     "linguars==0.4.0",
+    "lexilang==1.0.1",
     "morfessor ==2.0.6",
     "appdirs ==1.4.4",
     "APScheduler ==3.9.1",