forked from mirrors/LibreTranslate
Merge pull request #122 from PalmerAL/main
Improve language auto-detect for batch requests with multiple languages
This commit is contained in:
commit
db92fa86e5
2 changed files with 44 additions and 23 deletions
52
app/app.py
52
app/app.py
|
@ -360,43 +360,61 @@ def create_app(args):
|
||||||
)
|
)
|
||||||
|
|
||||||
if source_lang == "auto":
|
if source_lang == "auto":
|
||||||
candidate_langs = detect_languages(q)
|
source_langs = []
|
||||||
|
if batch:
|
||||||
|
auto_detect_texts = q
|
||||||
|
else:
|
||||||
|
auto_detect_texts = [q]
|
||||||
|
|
||||||
|
overall_candidates = detect_languages(q)
|
||||||
|
|
||||||
|
for text_to_check in auto_detect_texts:
|
||||||
|
if len(text_to_check) > 40:
|
||||||
|
candidate_langs = detect_languages(text_to_check)
|
||||||
|
else:
|
||||||
|
# Texts of 40 characters or fewer are too short to detect reliably on their own, so fall back to the candidates detected for the whole request
|
||||||
|
candidate_langs = overall_candidates
|
||||||
|
source_langs.append(candidate_langs[0]["language"])
|
||||||
|
|
||||||
if args.debug:
|
if args.debug:
|
||||||
print(candidate_langs)
|
print(text_to_check, candidate_langs)
|
||||||
|
print("Auto detected: %s" % candidate_langs[0]["language"])
|
||||||
|
else:
|
||||||
|
if batch:
|
||||||
|
source_langs = [source_lang for text in q]
|
||||||
|
else:
|
||||||
|
source_langs = [source_lang]
|
||||||
|
|
||||||
source_lang = candidate_langs[0]["language"]
|
src_langs = [next(iter([l for l in languages if l.code == source_lang]), None) for source_lang in source_langs]
|
||||||
|
|
||||||
if args.debug:
|
for idx, lang in enumerate(src_langs):
|
||||||
print("Auto detected: %s" % source_lang)
|
if lang is None:
|
||||||
|
abort(400, description="%s is not supported" % source_langs[idx])
|
||||||
|
|
||||||
src_lang = next(iter([l for l in languages if l.code == source_lang]), None)
|
|
||||||
tgt_lang = next(iter([l for l in languages if l.code == target_lang]), None)
|
tgt_lang = next(iter([l for l in languages if l.code == target_lang]), None)
|
||||||
|
|
||||||
if src_lang is None:
|
|
||||||
abort(400, description="%s is not supported" % source_lang)
|
|
||||||
if tgt_lang is None:
|
if tgt_lang is None:
|
||||||
abort(400, description="%s is not supported" % target_lang)
|
abort(400, description="%s is not supported" % target_lang)
|
||||||
|
|
||||||
translator = src_lang.get_translation(tgt_lang)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if batch:
|
if batch:
|
||||||
|
results = []
|
||||||
|
for idx, text in enumerate(q):
|
||||||
|
translator = src_langs[idx].get_translation(tgt_lang)
|
||||||
|
results.append(translator.translate(
|
||||||
|
transliterate(text, target_lang=source_langs[idx])
|
||||||
|
))
|
||||||
return jsonify(
|
return jsonify(
|
||||||
{
|
{
|
||||||
"translatedText": [
|
"translatedText": results
|
||||||
translator.translate(
|
|
||||||
transliterate(text, target_lang=source_lang)
|
|
||||||
)
|
|
||||||
for text in q
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
translator = src_langs[0].get_translation(tgt_lang)
|
||||||
return jsonify(
|
return jsonify(
|
||||||
{
|
{
|
||||||
"translatedText": translator.translate(
|
"translatedText": translator.translate(
|
||||||
transliterate(q, target_lang=source_lang)
|
transliterate(q, target_lang=source_langs[0])
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
|
@ -22,16 +22,19 @@ def detect_languages(text):
|
||||||
candidates = []
|
candidates = []
|
||||||
for t in text:
|
for t in text:
|
||||||
try:
|
try:
|
||||||
candidates.extend(Detector(t).languages)
|
d = Detector(t).languages
|
||||||
|
for i in range(len(d)):
|
||||||
|
d[i].text_length = len(t)
|
||||||
|
candidates.extend(d)
|
||||||
except UnknownLanguage:
|
except UnknownLanguage:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# total read bytes of the provided text
|
# total text length (in characters) across all detected candidates
|
||||||
read_bytes_total = sum(c.read_bytes for c in candidates)
|
text_length_total = sum(c.text_length for c in candidates)
|
||||||
|
|
||||||
# only use candidates that are supported by argostranslate
|
# only use candidates that are supported by argostranslate
|
||||||
candidate_langs = list(
|
candidate_langs = list(
|
||||||
filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, candidates)
|
filter(lambda l: l.text_length != 0 and l.code in __lang_codes, candidates)
|
||||||
)
|
)
|
||||||
|
|
||||||
# this happens if no language could be detected
|
# this happens if no language could be detected
|
||||||
|
@ -50,7 +53,7 @@ def detect_languages(text):
|
||||||
# if more than one is present, calculate the average confidence
|
# if more than one is present, calculate the average confidence
|
||||||
lang = lc[0]
|
lang = lc[0]
|
||||||
lang.confidence = sum(l.confidence for l in lc) / len(lc)
|
lang.confidence = sum(l.confidence for l in lc) / len(lc)
|
||||||
lang.read_bytes = sum(l.read_bytes for l in lc)
|
lang.text_length = sum(l.text_length for l in lc)
|
||||||
temp_average_list.append(lang)
|
temp_average_list.append(lang)
|
||||||
elif lc:
|
elif lc:
|
||||||
# otherwise just add it to the temporary list
|
# otherwise just add it to the temporary list
|
||||||
|
@ -62,7 +65,7 @@ def detect_languages(text):
|
||||||
|
|
||||||
# sort the candidates descending based on the detected confidence
|
# sort the candidates descending based on the detected confidence
|
||||||
candidate_langs.sort(
|
candidate_langs.sort(
|
||||||
key=lambda l: (l.confidence * l.read_bytes) / read_bytes_total, reverse=True
|
key=lambda l: (l.confidence * l.text_length) / text_length_total, reverse=True
|
||||||
)
|
)
|
||||||
|
|
||||||
return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs]
|
return [{"confidence": l.confidence, "language": l.code} for l in candidate_langs]
|
||||||
|
|
Loading…
Reference in a new issue