// Copyright (C) 2023 François Laignel
//
// This Source Code Form is subject to the terms of the Mozilla Public License, v2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at
// <https://mozilla.org/MPL/2.0/>.
//
// SPDX-License-Identifier: MPL-2.0

use gst::glib;
use gst::subclass::prelude::*;

use aws_sdk_translate as aws_translate;
use aws_sdk_translate::error::ProvideErrorMetadata;

use futures::channel::mpsc;
use futures::prelude::*;

use std::sync::Arc;

use super::imp::TranslateSrcPad;
use super::transcribe::TranscriptItem;
use super::{TranslationTokenizationMethod, CAT};

const SPAN_START: &str = "<span>";
const SPAN_END: &str = "</span>";
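/// A timed translated item.
///
/// Holds the translated `content` along with the `pts` and `duration`
/// derived from the source transcript items.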
#[derive(Debug)]
pub struct TranslatedItem {
    pub pts: gst::ClockTime,
    pub duration: gst::ClockTime,
    pub content: String,
}

impl From<&TranscriptItem> for TranslatedItem {
    fn from(transcript_item: &TranscriptItem) -> Self {
        TranslatedItem {
            pts: transcript_item.pts,
            duration: transcript_item.duration,
            content: transcript_item.content.clone(),
        }
    }
}
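/// Asynchronous loop which submits batches of transcript items to the
/// AWS Translate service and sends the translated items over the
/// translation channel.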
pub struct TranslateLoop {
    pad: glib::subclass::ObjectImplRef<TranslateSrcPad>,
    client: aws_translate::Client,
    input_lang: String,
    output_lang: String,
    tokenization_method: TranslationTokenizationMethod,
    transcript_rx: mpsc::Receiver<Arc<Vec<TranscriptItem>>>,
    translate_tx: mpsc::Sender<Vec<TranslatedItem>>,
}

impl TranslateLoop {
    pub fn new(
        imp: &super::imp::Transcriber,
        pad: &TranslateSrcPad,
        input_lang: &str,
        output_lang: &str,
        tokenization_method: TranslationTokenizationMethod,
        transcript_rx: mpsc::Receiver<Arc<Vec<TranscriptItem>>>,
        translate_tx: mpsc::Sender<Vec<TranslatedItem>>,
    ) -> Self {
        let aws_config = imp.aws_config.lock().unwrap();
        let aws_config = aws_config
            .as_ref()
            .expect("aws_config must be initialized at this stage");

        TranslateLoop {
            pad: pad.ref_counted(),
            client: aws_sdk_translate::Client::new(aws_config),
            input_lang: input_lang.to_string(),
            output_lang: output_lang.to_string(),
            tokenization_method,
            transcript_rx,
            translate_tx,
        }
    }

    pub async fn check_language(&self) -> Result<(), gst::ErrorMessage> {
        let language_list = self.client.list_languages().send().await.map_err(|err| {
            let err = format!(
                "Failed to call list_languages service: {err}: {}",
                err.meta()
            );
            gst::info!(CAT, imp: self.pad, "{err}");
            gst::error_msg!(gst::LibraryError::Failed, ["{err}"])
        })?;

        let found_output_lang = language_list
            .languages()
            .iter()
            .any(|lang| lang.language_code() == self.output_lang);

        if !found_output_lang {
            let err = format!("Unknown output language: {}", self.output_lang);
            gst::info!(CAT, imp: self.pad, "{err}");
            return Err(gst::error_msg!(gst::LibraryError::Failed, ["{err}"]));
        }

        Ok(())
    }
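    /// Main translation loop.
    ///
    /// Concatenates each batch of transcript items, optionally wrapping the
    /// items in `<span>` tags, submits the result to the translation service
    /// and sends the translated items over the translation channel.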
    pub async fn run(mut self) -> Result<(), gst::ErrorMessage> {
        use TranslationTokenizationMethod as Tokenization;

        while let Some(transcript_items) = self.transcript_rx.next().await {
            if transcript_items.is_empty() {
                continue;
            }

            let mut ts_duration_list: Vec<(gst::ClockTime, gst::ClockTime)> = vec![];
            let mut content: Vec<String> = vec![];

            let mut it = transcript_items.iter().peekable();
            while let Some(item) = it.next() {
                let suffix = match it.peek() {
                    Some(next_item) => {
                        if next_item.is_punctuation {
                            ""
                        } else {
                            " "
                        }
                    }
                    None => "",
                };

                ts_duration_list.push((item.pts, item.duration));

                content.push(match self.tokenization_method {
                    Tokenization::None => format!("{}{}", item.content, suffix),
                    Tokenization::SpanBased => {
                        format!("{SPAN_START}{}{SPAN_END}{}", item.content, suffix)
                    }
                });
            }

            let content: String = content.join("");
            gst::debug!(CAT, imp: self.pad, "Translating {content} with {ts_duration_list:?}");

            let translated_text = self
                .client
                .translate_text()
                .set_source_language_code(Some(self.input_lang.clone()))
                .set_target_language_code(Some(self.output_lang.clone()))
                .set_text(Some(content))
                .send()
                .await
                .map_err(|err| {
                    let err =
                        format!("Failed to call translation service: {err}: {}", err.meta());
                    gst::info!(CAT, imp: self.pad, "{err}");
                    gst::error_msg!(gst::LibraryError::Failed, ["{err}"])
                })?
                .translated_text;

            gst::debug!(CAT, imp: self.pad, "Got translation {translated_text}");

            let translated_items = match self.tokenization_method {
                Tokenization::None => {
                    // Push translation as a single item
                    let mut ts_duration_iter = ts_duration_list.into_iter().peekable();

                    let &(first_pts, _) = ts_duration_iter.peek().expect("at least one item");
                    let (last_pts, last_duration) =
                        ts_duration_iter.last().expect("at least one item");

                    vec![TranslatedItem {
                        pts: first_pts,
                        duration: last_pts.saturating_sub(first_pts) + last_duration,
                        content: translated_text,
                    }]
                }
                Tokenization::SpanBased => span_tokenize_items(&translated_text, ts_duration_list),
            };

            gst::trace!(CAT, imp: self.pad, "Sending {translated_items:?}");
            if self.translate_tx.send(translated_items).await.is_err() {
                gst::info!(
                    CAT,
                    imp: self.pad,
                    "translation chan terminated, exiting translation loop"
                );
                break;
            }
        }

        Ok(())
    }
}

/// Parses translated items from the `translation` `String` using `span` tags.
///
/// The `translation` is expected to have been returned by the `Translate` ws.
/// It can contain id-less `<span>` and `</span>` tags, matching similar
/// id-less tags from the content submitted to the `Translate` ws.
///
/// This parser accepts serial `<span>`s as well as nested `<span>`s.
///
/// The parsed items are assigned the ts and duration from `ts_duration_list`
/// in their order of appearance.
///
/// If more parsed items are found than entries in `ts_duration_list`, the
/// remaining items are concatenated to the last item.
///
/// If fewer parsed items are found, the last item is assigned the remaining
/// duration from the `ts_duration_list`.
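///
/// For example, a translation such as
/// `"<span>Bonjour</span> <span>tout le monde</span>"` parsed with two
/// ts/duration entries yields two items, `"Bonjour"` and `"tout le monde"`,
/// each assigned its respective ts and duration.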
fn span_tokenize_items(
    translation: &str,
    ts_duration_list: impl IntoIterator<Item = (gst::ClockTime, gst::ClockTime)>,
) -> Vec<TranslatedItem> {
    const SPAN_START_LEN: usize = SPAN_START.len();
    const SPAN_END_LEN: usize = SPAN_END.len();

    let mut translated_items = vec![];
    let mut ts_duration_iter = ts_duration_list.into_iter();

    // Content for a translated item
    let mut content = String::new();
    // Alleged span chunk
    let mut chunk = String::new();

    for c in translation.chars() {
        if content.is_empty() && c.is_whitespace() {
            // ignore leading whitespaces
            continue;
        }

        if chunk.is_empty() {
            if c == '<' {
                // Start an alleged span chunk
                chunk.push(c);
            } else {
                content.push(c);
            }

            continue;
        }

        chunk.push(c);

        match chunk.len() {
            len if len < SPAN_START_LEN => continue,
            SPAN_START_LEN => {
                if chunk != SPAN_START {
                    continue;
                }
                // Got a `<span>`
            }
            SPAN_END_LEN => {
                if chunk != SPAN_END {
                    continue;
                }
                // Got a `</span>`
            }
            _ => {
                // Can no longer be a span
                content.extend(chunk.drain(..));
                continue;
            }
        }

        // got a span
        chunk.clear();

        if content.is_empty() {
            continue;
        }

        // Add pending content
        // assign it the next pts and duration from the input list
        if let Some((pts, duration)) = ts_duration_iter.next() {
            translated_items.push(TranslatedItem {
                pts,
                duration,
                content,
            });
            content = String::new();
        } else if let Some(last_item) = translated_items.last_mut() {
            // exhausted available pts and duration
            // add content to last item
            if !last_item.content.ends_with(' ') {
                last_item.content.push(' ');
            }
            last_item.content.extend(content.drain(..));
        }
    }

    content.extend(chunk.drain(..));
    if !content.is_empty() {
        // Add last content
        if let Some((pts, mut duration)) = ts_duration_iter.next() {
            if let Some((last_pts, last_duration)) = ts_duration_iter.last() {
                // Fix remaining duration
                duration = last_pts.saturating_sub(pts) + last_duration;
            }
            translated_items.push(TranslatedItem {
                pts,
                duration,
                content,
            });
        } else if let Some(last_item) = translated_items.last_mut() {
            // No more pts and duration in the index
            // Add remaining content to the last item pushed
            if !last_item.content.ends_with(' ') {
                last_item.content.push(' ');
            }
            last_item.content.push_str(&content);
        }
    } else if let Some((last_pts, last_duration)) = ts_duration_iter.last() {
        if let Some(last_item) = translated_items.last_mut() {
            // No more content, but need to fix last item's duration
            last_item.duration = last_pts.saturating_sub(last_item.pts) + last_duration;
        }
    }

    translated_items
}

#[cfg(test)]
mod tests {
    use super::span_tokenize_items;
    use gst::prelude::*;

    #[test]
    fn serial_spans() {
        let input = "<span>first</span> <span>second</span> <span>third</span>";
        let ts_duration_list = vec![
            (0.seconds(), 1.seconds()),
            (1.seconds(), 2.seconds()),
            (4.seconds(), 3.seconds()),
        ];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let first = items.next().unwrap();
        assert_eq!(first.pts, 0.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "first");

        let second = items.next().unwrap();
        assert_eq!(second.pts, 1.seconds());
        assert_eq!(second.duration, 2.seconds());
        assert_eq!(second.content, "second");

        let third = items.next().unwrap();
        assert_eq!(third.pts, 4.seconds());
        assert_eq!(third.duration, 3.seconds());
        assert_eq!(third.content, "third");

        assert!(items.next().is_none());
    }

    #[test]
    fn serial_and_nested_spans() {
        let input =
            "<span>first</span> <span>second <span>third</span></span> <span>fourth</span>";
        let ts_duration_list = vec![
            (0.seconds(), 1.seconds()),
            (1.seconds(), 2.seconds()),
            (3.seconds(), 1.seconds()),
            (4.seconds(), 2.seconds()),
        ];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let first = items.next().unwrap();
        assert_eq!(first.pts, 0.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "first");

        let second = items.next().unwrap();
        assert_eq!(second.pts, 1.seconds());
        assert_eq!(second.duration, 2.seconds());
        assert_eq!(second.content, "second ");

        let third = items.next().unwrap();
        assert_eq!(third.pts, 3.seconds());
        assert_eq!(third.duration, 1.seconds());
        assert_eq!(third.content, "third");

        let fourth = items.next().unwrap();
        assert_eq!(fourth.pts, 4.seconds());
        assert_eq!(fourth.duration, 2.seconds());
        assert_eq!(fourth.content, "fourth");

        assert!(items.next().is_none());
    }

    #[test]
    fn nonspaned_serial_and_nested_spans() {
        let input = "Initial <span>first</span> <span>second <span>third</span></span> <span>fourth</span> final";
        let ts_duration_list = vec![
            (0.seconds(), 1.seconds()),
            (1.seconds(), 1.seconds()),
            (2.seconds(), 1.seconds()),
            (3.seconds(), 1.seconds()),
            (4.seconds(), 1.seconds()),
            (5.seconds(), 1.seconds()),
        ];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let init = items.next().unwrap();
        assert_eq!(init.pts, 0.seconds());
        assert_eq!(init.duration, 1.seconds());
        assert_eq!(init.content, "Initial ");

        let first = items.next().unwrap();
        assert_eq!(first.pts, 1.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "first");

        let second = items.next().unwrap();
        assert_eq!(second.pts, 2.seconds());
        assert_eq!(second.duration, 1.seconds());
        assert_eq!(second.content, "second ");

        let third = items.next().unwrap();
        assert_eq!(third.pts, 3.seconds());
        assert_eq!(third.duration, 1.seconds());
        assert_eq!(third.content, "third");

        let fourth = items.next().unwrap();
        assert_eq!(fourth.pts, 4.seconds());
        assert_eq!(fourth.duration, 1.seconds());
        assert_eq!(fourth.content, "fourth");

        let final_ = items.next().unwrap();
        assert_eq!(final_.pts, 5.seconds());
        assert_eq!(final_.duration, 1.seconds());
        assert_eq!(final_.content, "final");

        assert!(items.next().is_none());
    }

    #[test]
    fn more_parsed_items() {
        let input =
            "<span>first</span> <span>second</span> <span>third</span> <span>fourth</span>";
        let ts_duration_list = vec![
            (0.seconds(), 1.seconds()),
            (1.seconds(), 2.seconds()),
            (4.seconds(), 3.seconds()),
        ];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let first = items.next().unwrap();
        assert_eq!(first.pts, 0.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "first");

        let second = items.next().unwrap();
        assert_eq!(second.pts, 1.seconds());
        assert_eq!(second.duration, 2.seconds());
        assert_eq!(second.content, "second");

        let third = items.next().unwrap();
        assert_eq!(third.pts, 4.seconds());
        assert_eq!(third.duration, 3.seconds());
        assert_eq!(third.content, "third fourth");

        assert!(items.next().is_none());
    }

    #[test]
    fn more_parsed_items_nonspan_final() {
        let input = "<span>first</span> <span>second</span> <span>third</span> final";
        let ts_duration_list = vec![
            (0.seconds(), 1.seconds()),
            (1.seconds(), 2.seconds()),
            (4.seconds(), 3.seconds()),
        ];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let first = items.next().unwrap();
        assert_eq!(first.pts, 0.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "first");

        let second = items.next().unwrap();
        assert_eq!(second.pts, 1.seconds());
        assert_eq!(second.duration, 2.seconds());
        assert_eq!(second.content, "second");

        let third = items.next().unwrap();
        assert_eq!(third.pts, 4.seconds());
        assert_eq!(third.duration, 3.seconds());
        assert_eq!(third.content, "third final");

        assert!(items.next().is_none());
    }

    #[test]
    fn less_parsed_items() {
        let input = "<span>first</span> <span>second</span>";
        let ts_duration_list = vec![
            (0.seconds(), 1.seconds()),
            (1.seconds(), 2.seconds()),
            (4.seconds(), 3.seconds()),
        ];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let first = items.next().unwrap();
        assert_eq!(first.pts, 0.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "first");

        let second = items.next().unwrap();
        assert_eq!(second.pts, 1.seconds());
        assert_eq!(second.duration, 6.seconds());
        assert_eq!(second.content, "second");

        assert!(items.next().is_none());
    }

    #[test]
    fn less_parsed_items_nonspan_final() {
        let input = "<span>first</span> final";
        let ts_duration_list = vec![
            (0.seconds(), 1.seconds()),
            (1.seconds(), 2.seconds()),
            (4.seconds(), 3.seconds()),
        ];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let first = items.next().unwrap();
        assert_eq!(first.pts, 0.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "first");

        let final_ = items.next().unwrap();
        assert_eq!(final_.pts, 1.seconds());
        assert_eq!(final_.duration, 6.seconds());
        assert_eq!(final_.content, "final");

        assert!(items.next().is_none());
    }

    #[test]
    fn utf8_input() {
        let input = "caractères accentués";
        let ts_duration_list = vec![(0.seconds(), 1.seconds())];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let first = items.next().unwrap();
        assert_eq!(first.pts, 0.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "caractères accentués");

        assert!(items.next().is_none());
    }
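
    // Purely nested `<span>`s: the parser skips the redundant tags and the
    // inner content yields a single item assigned the only ts/duration entry.
    #[test]
    fn nested_spans_only() {
        let input = "<span><span>nested</span></span>";
        let ts_duration_list = vec![(0.seconds(), 1.seconds())];

        let mut items = span_tokenize_items(input, ts_duration_list).into_iter();

        let first = items.next().unwrap();
        assert_eq!(first.pts, 0.seconds());
        assert_eq!(first.duration, 1.seconds());
        assert_eq!(first.content, "nested");

        assert!(items.next().is_none());
    }
}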