Workaround for salad

This commit is contained in:
Piero Toffanin 2023-12-11 17:14:27 -05:00
parent 7508dbaa46
commit 310b71c056

View file

@ -78,7 +78,7 @@ def detect_languages(text):
return [{"confidence": l.confidence, "language": l.code} for l in candidates]
def improve_translation_formatting(source, translation, improve_punctuation=True):
def improve_translation_formatting(source, translation, improve_punctuation=True, remove_single_word_duplicates=True):
source = source.strip()
if not len(source):
@ -101,6 +101,21 @@ def improve_translation_formatting(source, translation, improve_punctuation=True
elif translation_last_char in punctuation_chars:
translation = translation[:-1]
# A workaround for certain language models that output
# a single word repeated ad infinitum (the "salad" bug)
# https://github.com/LibreTranslate/LibreTranslate/issues/46
if remove_single_word_duplicates:
if len(source) < 20 and source.count(" ") == 0 and translation.count(" ") > 0:
bow = translation.split()
count = {}
for word in bow:
count[word] = count.get(word, 0) + 1
for word in count:
if count[word] / len(count) >= 2:
translation = bow[0]
break
if source.islower():
return translation.lower()