From 54741b7cc414605c7c026cd19e04168b7c721da6 Mon Sep 17 00:00:00 2001 From: Michiel Konstapel Date: Sat, 10 Dec 2022 21:19:51 +0100 Subject: [PATCH] audiornnoise: add voice detection threshold Add a property "voice-activity-threshold". Frames where the voice detection score from the RNN is below the threshold will be completely muted. Part-of: --- audio/audiofx/src/audiornnoise/imp.rs | 86 +++++++++++++++++++++++---- docs/plugins/gst_plugins_cache.json | 16 +++++ 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/audio/audiofx/src/audiornnoise/imp.rs b/audio/audiofx/src/audiornnoise/imp.rs index d91f4ef4..b804bd05 100644 --- a/audio/audiofx/src/audiornnoise/imp.rs +++ b/audio/audiofx/src/audiornnoise/imp.rs @@ -7,6 +7,8 @@ // // SPDX-License-Identifier: MPL-2.0 +use std::sync::Mutex; + use gst::glib; use gst::prelude::*; use gst::subclass::prelude::*; @@ -31,8 +33,22 @@ static CAT: Lazy = Lazy::new(|| { ) }); +const DEFAULT_VOICE_ACTIVITY_THRESHOLD: f32 = 0.0; const FRAME_SIZE: usize = DenoiseState::FRAME_SIZE; +#[derive(Debug, Clone, Copy)] +struct Settings { + vad_threshold: f32, +} + +impl Default for Settings { + fn default() -> Self { + Settings { + vad_threshold: DEFAULT_VOICE_ACTIVITY_THRESHOLD, + } + } +} + struct ChannelDenoiser { denoiser: Box>, frame_chunk: Box<[f32; FRAME_SIZE]>, @@ -47,6 +63,7 @@ struct State { #[derive(Default)] pub struct AudioRNNoise { + settings: Mutex, state: AtomicRefCell>, } @@ -83,7 +100,7 @@ impl State { self.adapter.available() < (FRAME_SIZE * self.in_info.bpf() as usize) } - fn process(&mut self, input_plane: &[f32], output_plane: &mut [f32]) { + fn process(&mut self, input_plane: &[f32], output_plane: &mut [f32], settings: &Settings) { let channels = self.in_info.channels() as usize; let size = FRAME_SIZE * channels; @@ -104,18 +121,26 @@ impl State { // FIXME: The first chunks coming out of the denoisers contains some // fade-in artifacts. We might want to discard those. + let mut vad: f32 = 0.0; for channel_denoiser in &mut self.denoisers { - channel_denoiser.denoiser.process_frame( - &mut channel_denoiser.out_chunk[..], - &channel_denoiser.frame_chunk[..], + vad = f32::max( + vad, + channel_denoiser.denoiser.process_frame( + &mut channel_denoiser.out_chunk[..], + &channel_denoiser.frame_chunk[..], + ), ); } - for (index, item) in out_frame.iter_mut().enumerate() { - let channel_index = index % channels; - let channel_denoiser = &self.denoisers[channel_index]; - let pos = index / channels; - *item = channel_denoiser.out_chunk[pos] / 32767.0; + if vad < settings.vad_threshold { + out_frame.fill(0.0); + } else { + for (index, item) in out_frame.iter_mut().enumerate() { + let channel_index = index % channels; + let channel_denoiser = &self.denoisers[channel_index]; + let pos = index / channels; + *item = channel_denoiser.out_chunk[pos] / 32767.0; + } } } } @@ -131,6 +156,7 @@ impl AudioRNNoise { return Ok(gst::FlowSuccess::Ok); } + let settings = *self.settings.lock().unwrap(); let mut buffer = gst::Buffer::with_size(available).map_err(|e| { gst::error!(CAT, imp: self, "Failed to allocate buffer at EOS {:?}", e); gst::FlowError::Flushing @@ -151,7 +177,7 @@ impl AudioRNNoise { let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?; let out_data = out_map.as_mut_slice_of::().unwrap(); - state.process(in_data, out_data); + state.process(in_data, out_data, &settings); } self.obj().src_pad().push(buffer) @@ -164,6 +190,7 @@ impl AudioRNNoise { let duration = state.buffer_duration(output_size as _); let pts = state.current_pts(); + let settings = *self.settings.lock().unwrap(); let mut buffer = gst::Buffer::with_size(output_size).map_err(|_| gst::FlowError::Error)?; { @@ -181,7 +208,7 @@ impl AudioRNNoise { let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?; let out_data = out_map.as_mut_slice_of::().unwrap(); - state.process(in_data, out_data); + state.process(in_data, out_data, &settings); } Ok(GenerateOutputSuccess::Buffer(buffer)) @@ -195,7 +222,42 @@ impl ObjectSubclass for AudioRNNoise { type ParentType = gst_base::BaseTransform; } -impl ObjectImpl for AudioRNNoise {} +impl ObjectImpl for AudioRNNoise { + fn properties() -> &'static [glib::ParamSpec] { + static PROPERTIES: Lazy> = Lazy::new(|| { + vec![glib::ParamSpecFloat::builder("voice-activity-threshold") + .nick("Voice activity threshold") + .blurb("Threshold of the voice activity detector below which to mute the output") + .minimum(0.0) + .maximum(1.0) + .default_value(DEFAULT_VOICE_ACTIVITY_THRESHOLD) + .mutable_playing() + .build()] + }); + + PROPERTIES.as_ref() + } + + fn set_property(&self, _id: usize, value: &glib::Value, pspec: &glib::ParamSpec) { + match pspec.name() { + "voice-activity-threshold" => { + let mut settings = self.settings.lock().unwrap(); + settings.vad_threshold = value.get().expect("type checked upstream"); + } + _ => unimplemented!(), + } + } + + fn property(&self, _id: usize, pspec: &glib::ParamSpec) -> glib::Value { + match pspec.name() { + "voice-activity-threshold" => { + let settings = self.settings.lock().unwrap(); + settings.vad_threshold.to_value() + } + _ => unimplemented!(), + } + } +} impl GstObjectImpl for AudioRNNoise {} diff --git a/docs/plugins/gst_plugins_cache.json b/docs/plugins/gst_plugins_cache.json index c8e51e37..9253303b 100644 --- a/docs/plugins/gst_plugins_cache.json +++ b/docs/plugins/gst_plugins_cache.json @@ -3824,6 +3824,22 @@ "presence": "always" } }, + "properties": { + "voice-activity-threshold": { + "blurb": "Threshold of the voice activity detector below which to mute the output", + "conditionally-available": false, + "construct": false, + "construct-only": false, + "controllable": false, + "default": "0", + "max": "1", + "min": "0", + "mutable": "playing", + "readable": true, + "type": "gfloat", + "writable": true + } + }, "rank": "none" }, "ebur128level": {