Merge pull request #122 from PalmerAL/main

Improve language auto-detect for batch requests with multiple languages
This commit is contained in:
Piero Toffanin 2021-08-02 09:04:19 -05:00 committed by GitHub
commit db92fa86e5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 44 additions and 23 deletions

View file

@@ -360,43 +360,61 @@ def create_app(args):
) )
if source_lang == "auto": if source_lang == "auto":
candidate_langs = detect_languages(q) source_langs = []
if batch:
auto_detect_texts = q
else:
auto_detect_texts = [q]
if args.debug: overall_candidates = detect_languages(q)
print(candidate_langs)
source_lang = candidate_langs[0]["language"] for text_to_check in auto_detect_texts:
if len(text_to_check) > 40:
candidate_langs = detect_languages(text_to_check)
else:
# Unable to accurately detect languages for short texts
candidate_langs = overall_candidates
source_langs.append(candidate_langs[0]["language"])
if args.debug: if args.debug:
print("Auto detected: %s" % source_lang) print(text_to_check, candidate_langs)
print("Auto detected: %s" % candidate_langs[0]["language"])
else:
if batch:
source_langs = [source_lang for text in q]
else:
source_langs = [source_lang]
src_langs = [next(iter([l for l in languages if l.code == source_lang]), None) for source_lang in source_langs]
for idx, lang in enumerate(src_langs):
if lang is None:
abort(400, description="%s is not supported" % source_langs[idx])
src_lang = next(iter([l for l in languages if l.code == source_lang]), None)
tgt_lang = next(iter([l for l in languages if l.code == target_lang]), None) tgt_lang = next(iter([l for l in languages if l.code == target_lang]), None)
if src_lang is None:
abort(400, description="%s is not supported" % source_lang)
if tgt_lang is None: if tgt_lang is None:
abort(400, description="%s is not supported" % target_lang) abort(400, description="%s is not supported" % target_lang)
translator = src_lang.get_translation(tgt_lang)
try: try:
if batch: if batch:
results = []
for idx, text in enumerate(q):
translator = src_langs[idx].get_translation(tgt_lang)
results.append(translator.translate(
transliterate(text, target_lang=source_langs[idx])
))
return jsonify( return jsonify(
{ {
"translatedText": [ "translatedText": results
translator.translate(
transliterate(text, target_lang=source_lang)
)
for text in q
]
} }
) )
else: else:
translator = src_langs[0].get_translation(tgt_lang)
return jsonify( return jsonify(
{ {
"translatedText": translator.translate( "translatedText": translator.translate(
transliterate(q, target_lang=source_lang) transliterate(q, target_lang=source_langs[0])
) )
} }
) )

View file

@@ -22,16 +22,19 @@ def detect_languages(text):
candidates = [] candidates = []
for t in text: for t in text:
try: try:
candidates.extend(Detector(t).languages) d = Detector(t).languages
for i in range(len(d)):
d[i].text_length = len(t)
candidates.extend(d)
except UnknownLanguage: except UnknownLanguage:
pass pass
# total read bytes of the provided text # total read bytes of the provided text
read_bytes_total = sum(c.read_bytes for c in candidates) text_length_total = sum(c.text_length for c in candidates)
# only use candidates that are supported by argostranslate # only use candidates that are supported by argostranslate
candidate_langs = list( candidate_langs = list(
filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, candidates) filter(lambda l: l.text_length != 0 and l.code in __lang_codes, candidates)
) )
# this happens if no language could be detected # this happens if no language could be detected
@@ -50,7 +53,7 @@ def detect_languages(text):
# if more than one is present, calculate the average confidence # if more than one is present, calculate the average confidence
lang = lc[0] lang = lc[0]
lang.confidence = sum(l.confidence for l in lc) / len(lc) lang.confidence = sum(l.confidence for l in lc) / len(lc)
lang.read_bytes = sum(l.read_bytes for l in lc) lang.text_length = sum(l.text_length for l in lc)
temp_average_list.append(lang) temp_average_list.append(lang)
elif lc: elif lc:
# otherwise just add it to the temporary list # otherwise just add it to the temporary list
@@ -62,7 +65,7 @@ def detect_languages(text):
# sort the candidates descending based on the detected confidence # sort the candidates descending based on the detected confidence
candidate_langs.sort( candidate_langs.sort(
key=lambda l: (l.confidence * l.read_bytes) / read_bytes_total, reverse=True key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True
) )
return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs] return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs]