Merge pull request #528 from pierotofy/ld

Use langdetect
This commit is contained in:
Piero Toffanin 2023-10-30 20:54:35 -04:00 committed by GitHub
commit 92df4c3e6b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 23 additions and 37 deletions

View file

@@ -554,22 +554,8 @@ def create_app(args):
             )
             if source_lang == "auto":
-                source_langs = []
-                auto_detect_texts = q if batch else [q]
-                overall_candidates = detect_languages(q)
-                for text_to_check in auto_detect_texts:
-                    if len(text_to_check) > 40:
-                        candidate_langs = detect_languages(text_to_check)
-                    else:
-                        # Unable to accurately detect languages for short texts
-                        candidate_langs = overall_candidates
-                    source_langs.append(candidate_langs[0])
-                    if args.debug:
-                        print(text_to_check, candidate_langs)
-                        print("Auto detected: %s" % candidate_langs[0]["language"])
+                candidate_langs = detect_languages(q if batch else [q])
+                source_langs = [candidate_langs[0]]
             else:
                 if batch:
                     source_langs = [ {"confidence": 100.0, "language": source_lang} for text in q]

View file

@@ -1,6 +1,9 @@
 from functools import lru_cache
 
-import linguars
+from langdetect import DetectorFactory
+
+DetectorFactory.seed = 0
+from langdetect import detect_langs
 from lexilang.detector import detect as lldetect
@@ -12,34 +15,31 @@ class Language:
     def __str__(self):
         return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ")
 
-@lru_cache(maxsize=None)
-def load_detector(langcodes = ()):
-    languages = []
-    for lc in langcodes:
-        if lc == 'zt':
-            continue
-        try:
-            languages.append(linguars.Language.from_iso_code_639_1(lc))
-        except Exception:
-            print(f"{lc} is not supported by lingua")
-            pass # Not supported
-    return linguars.LanguageDetector(languages=languages)
+def check_lang(langcodes, lang):
+    return normalized_lang_code(lang) in langcodes
+
+def normalized_lang_code(lang):
+    code = lang.lang
+    # Handle zh-cn
+    if code.startswith("zh"):
+        code = "zh"
+    return code
 
 class Detector:
     def __init__(self, langcodes = ()):
         self.langcodes = langcodes
-        self.detector = load_detector(langcodes)
 
     def detect(self, text):
-        if len(text) < 18:
+        if len(text) < 20:
             code, conf = lldetect(text, self.langcodes)
             if conf > 0:
                 return [Language(code, round(conf * 100))]
 
-        top_3_choices = self.detector.confidence(text)[:3]
-        if top_3_choices[0][1] == 0:
+        top_3_choices = [lang for lang in detect_langs(text) if check_lang(self.langcodes, lang)][:3]
+        if not len(top_3_choices):
+            return [Language("en", 0)]
+
+        if top_3_choices[0].prob == 0:
             return [Language("en", 0)]
-        return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices]
+        return [Language(normalized_lang_code(lang), round(lang.prob * 100)) for lang in top_3_choices]

View file

@@ -42,7 +42,7 @@ dependencies = [
     "Flask-Session ==0.4.0",
     "waitress ==2.1.2",
     "expiringdict ==1.2.2",
-    "linguars==0.4.0",
+    "langdetect==1.0.9",
    "lexilang==1.0.1",
     "morfessor ==2.0.6",
     "appdirs ==1.4.4",