From 310b71c056700fcd55ff1a3022d5abf7e61efa72 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Mon, 11 Dec 2023 17:14:27 -0500 Subject: [PATCH] Workaround for salad --- libretranslate/language.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/libretranslate/language.py b/libretranslate/language.py index 44f926f..407f695 100644 --- a/libretranslate/language.py +++ b/libretranslate/language.py @@ -78,7 +78,7 @@ def detect_languages(text): return [{"confidence": l.confidence, "language": l.code} for l in candidates] -def improve_translation_formatting(source, translation, improve_punctuation=True): +def improve_translation_formatting(source, translation, improve_punctuation=True, remove_single_word_duplicates=True): source = source.strip() if not len(source): @@ -101,6 +101,21 @@ def improve_translation_formatting(source, translation, improve_punctuation=True elif translation_last_char in punctuation_chars: translation = translation[:-1] + # A workaround for certain language models that output + # the single word repeated ad-infinitum (the "salad" bug) + # https://github.com/LibreTranslate/LibreTranslate/issues/46 + if remove_single_word_duplicates: + if len(source) < 20 and source.count(" ") == 0 and translation.count(" ") > 0: + bow = translation.split() + count = {} + for word in bow: + count[word] = count.get(word, 0) + 1 + + for word in count: + if count[word] / len(count) >= 2: + translation = bow[0] + break + if source.islower(): return translation.lower()