From 644c10f62cb28292e7a79021324d8301cae7d6e8 Mon Sep 17 00:00:00 2001 From: Mathieu Duponchelle Date: Mon, 10 Mar 2025 19:25:31 +0100 Subject: [PATCH] net/aws: don't insert space when joining leftover punctuation Part-of: --- net/aws/src/transcriber/translate.rs | 48 +++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/net/aws/src/transcriber/translate.rs b/net/aws/src/transcriber/translate.rs index 712a7404a..5c1a15d03 100644 --- a/net/aws/src/transcriber/translate.rs +++ b/net/aws/src/transcriber/translate.rs @@ -300,10 +300,15 @@ pub fn span_tokenize_items( } else if let Some(last_item) = translated_items.last_mut() { // exhausted available pts and duration // add content to last item - if !last_item.content.ends_with(' ') { + let starts_with_punctuation = content.starts_with(|c: char| c.is_ascii_punctuation()); + + if !starts_with_punctuation { last_item.content.push(' '); } - last_item.content.extend(content.drain(..)); + + last_item.content.push_str(content.trim()); + + content = String::new(); } } @@ -320,15 +325,16 @@ pub fn span_tokenize_items( translated_items.push(TranslatedItem { pts, duration, - content, + content: content.trim().to_string(), }); } else if let Some(last_item) = translated_items.last_mut() { // No more pts and duration in the index // Add remaining content to the last item pushed - if !last_item.content.ends_with(' ') { + let starts_with_punctuation = content.starts_with(|c: char| c.is_ascii_punctuation()); + if !starts_with_punctuation { last_item.content.push(' '); } - last_item.content.push_str(&content); + last_item.content.push_str(content.trim()); } } else if let Some((last_pts, last_duration)) = ts_duration_iter.last() { if let Some(last_item) = translated_items.last_mut() { @@ -576,4 +582,36 @@ mod tests { assert!(items.next().is_none()); } + + #[test] + fn exhausted_spans_join_punctuation() { + let input = "et les Clippers sont au tableau, et c'est Norman qui attaque en lisant Max Christie."; + + let ts_duration_list = vec![ + (0.seconds(), 1.seconds()), + (1.seconds(), 1.seconds()), + (2.seconds(), 1.seconds()), + (3.seconds(), 1.seconds()), + (4.seconds(), 1.seconds()), + (5.seconds(), 1.seconds()), + (6.seconds(), 1.seconds()), + (7.seconds(), 1.seconds()), + (8.seconds(), 1.seconds()), + (9.seconds(), 1.seconds()), + (10.seconds(), 1.seconds()), + (11.seconds(), 1.seconds()), + (12.seconds(), 1.seconds()), + (13.seconds(), 1.seconds()), + (14.seconds(), 1.seconds()), + (15.seconds(), 1.seconds()), + ]; + + let items = span_tokenize_items(input, ts_duration_list).into_iter(); + + let final_ = items.last().unwrap(); + + // when all spans are consumed and punctuation remains as the content, + // don't join it with a space with the last item content (Christie .) + assert!(final_.content == "Christie."); + } }