net/aws: consolidate apostrophes with next word in span_tokenize_items

In span-based mode, the text to translate looks like:

<span>the</span> <span>year</span>

We may receive back from the service when translating to French:

<span>l'</span> <span>année</span>

This in turn means we'll push out two items. It makes more sense to
push those as a single item, as this will prevent downstream elements
from inserting a space between the elided article and the word that follows it.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/merge_requests/2126>
This commit is contained in:
Mathieu Duponchelle 2025-03-10 20:14:12 +01:00
parent 644c10f62c
commit 10ed6582a9

View file

@ -15,6 +15,7 @@ use aws_sdk_translate::error::ProvideErrorMetadata;
use futures::channel::mpsc;
use futures::prelude::*;
use std::collections::VecDeque;
use std::sync::Arc;
use super::imp::TranslateSrcPad;
@ -343,7 +344,24 @@ pub fn span_tokenize_items(
}
}
translated_items
let mut consolidated_items: VecDeque<TranslatedItem> = VecDeque::new();
let mut consolidate = false;
for item in translated_items.drain(..) {
if consolidate {
let last_item = consolidated_items.back_mut().unwrap();
last_item.duration = item.pts + item.duration - last_item.pts;
last_item.content += &item.content;
consolidate = false;
continue;
}
if item.content.ends_with("'") || item.content.ends_with("’") {
consolidate = true;
}
consolidated_items.push_back(item);
}
consolidated_items.into()
}
#[cfg(test)]