From c29cecbb6397caaf2f60e314a4903ff0d01cace7 Mon Sep 17 00:00:00 2001 From: PalmerAL Date: Mon, 2 Aug 2021 00:06:56 -0500 Subject: [PATCH] improve auto-detect for batch requests with multiple languages --- app/app.py | 54 ++++++++++++++++++++++++++++++++----------------- app/language.py | 13 +++++++----- 2 files changed, 44 insertions(+), 23 deletions(-) diff --git a/app/app.py b/app/app.py index c21a4e0..1bcc8c7 100644 --- a/app/app.py +++ b/app/app.py @@ -360,43 +360,61 @@ def create_app(args): ) if source_lang == "auto": - candidate_langs = detect_languages(q) + source_langs = [] + if batch: + auto_detect_texts = q + else: + auto_detect_texts = [q] - if args.debug: - print(candidate_langs) + overall_candidates = detect_languages(q) + + for text_to_check in auto_detect_texts: + if len(text_to_check) > 40: + candidate_langs = detect_languages(text_to_check) + else: + # Unable to accurately detect languages for short texts + candidate_langs = overall_candidates + source_langs.append(candidate_langs[0]["language"]) - source_lang = candidate_langs[0]["language"] + if args.debug: + print(text_to_check, candidate_langs) + print("Auto detected: %s" % candidate_langs[0]["language"]) + else: + if batch: + source_langs = [source_lang for text in q] + else: + source_langs = [source_lang] - if args.debug: - print("Auto detected: %s" % source_lang) + src_langs = [next(iter([l for l in languages if l.code == source_lang]), None) for source_lang in source_langs] + + for idx, lang in enumerate(src_langs): + if lang is None: + abort(400, description="%s is not supported" % source_langs[idx]) - src_lang = next(iter([l for l in languages if l.code == source_lang]), None) tgt_lang = next(iter([l for l in languages if l.code == target_lang]), None) - if src_lang is None: - abort(400, description="%s is not supported" % source_lang) if tgt_lang is None: abort(400, description="%s is not supported" % target_lang) - translator = src_lang.get_translation(tgt_lang) - try: if batch: + results = [] + for idx, text in enumerate(q): + translator = src_langs[idx].get_translation(tgt_lang) + results.append(translator.translate( + transliterate(text, target_lang=source_langs[idx]) + )) return jsonify( { - "translatedText": [ - translator.translate( - transliterate(text, target_lang=source_lang) - ) - for text in q - ] + "translatedText": results } ) else: + translator = src_langs[0].get_translation(tgt_lang) return jsonify( { "translatedText": translator.translate( - transliterate(q, target_lang=source_lang) + transliterate(q, target_lang=source_langs[0]) ) } ) diff --git a/app/language.py b/app/language.py index 1b35812..868eab8 100644 --- a/app/language.py +++ b/app/language.py @@ -22,16 +22,19 @@ def detect_languages(text): candidates = [] for t in text: try: - candidates.extend(Detector(t).languages) + d = Detector(t).languages + for i in range(len(d)): + d[i].text_length = len(t) + candidates.extend(d) except UnknownLanguage: pass # total read bytes of the provided text - read_bytes_total = sum(c.read_bytes for c in candidates) + text_length_total = sum(c.text_length for c in candidates) # only use candidates that are supported by argostranslate candidate_langs = list( - filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, candidates) + filter(lambda l: l.text_length != 0 and l.code in __lang_codes, candidates) ) # this happens if no language could be detected @@ -50,7 +53,7 @@ def detect_languages(text): # if more than one is present, calculate the average confidence lang = lc[0] lang.confidence = sum(l.confidence for l in lc) / len(lc) - lang.read_bytes = sum(l.read_bytes for l in lc) + lang.text_length = sum(l.text_length for l in lc) temp_average_list.append(lang) elif lc: # otherwise just add it to the temporary list @@ -62,7 +65,7 @@ def detect_languages(text): # sort the candidates descending based on the detected confidence candidate_langs.sort( - key=lambda l: (l.confidence * l.read_bytes) / read_bytes_total, reverse=True + key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True ) return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs]