audiornnoise: add voice detection threshold

Add a property "voice-activity-threshold". Frames where the voice
detection score from the RNN is below the threshold will be completely
muted.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/merge_requests/1004>
This commit is contained in:
Michiel Konstapel 2022-12-10 21:19:51 +01:00 committed by Sebastian Dröge
parent a077ecba85
commit 54741b7cc4
2 changed files with 90 additions and 12 deletions

View file

@ -7,6 +7,8 @@
// //
// SPDX-License-Identifier: MPL-2.0 // SPDX-License-Identifier: MPL-2.0
use std::sync::Mutex;
use gst::glib; use gst::glib;
use gst::prelude::*; use gst::prelude::*;
use gst::subclass::prelude::*; use gst::subclass::prelude::*;
@ -31,8 +33,22 @@ static CAT: Lazy<gst::DebugCategory> = Lazy::new(|| {
) )
}); });
const DEFAULT_VOICE_ACTIVITY_THRESHOLD: f32 = 0.0;
const FRAME_SIZE: usize = DenoiseState::FRAME_SIZE; const FRAME_SIZE: usize = DenoiseState::FRAME_SIZE;
#[derive(Debug, Clone, Copy)]
struct Settings {
vad_threshold: f32,
}
impl Default for Settings {
fn default() -> Self {
Settings {
vad_threshold: DEFAULT_VOICE_ACTIVITY_THRESHOLD,
}
}
}
struct ChannelDenoiser { struct ChannelDenoiser {
denoiser: Box<DenoiseState<'static>>, denoiser: Box<DenoiseState<'static>>,
frame_chunk: Box<[f32; FRAME_SIZE]>, frame_chunk: Box<[f32; FRAME_SIZE]>,
@ -47,6 +63,7 @@ struct State {
#[derive(Default)] #[derive(Default)]
pub struct AudioRNNoise { pub struct AudioRNNoise {
settings: Mutex<Settings>,
state: AtomicRefCell<Option<State>>, state: AtomicRefCell<Option<State>>,
} }
@ -83,7 +100,7 @@ impl State {
self.adapter.available() < (FRAME_SIZE * self.in_info.bpf() as usize) self.adapter.available() < (FRAME_SIZE * self.in_info.bpf() as usize)
} }
fn process(&mut self, input_plane: &[f32], output_plane: &mut [f32]) { fn process(&mut self, input_plane: &[f32], output_plane: &mut [f32], settings: &Settings) {
let channels = self.in_info.channels() as usize; let channels = self.in_info.channels() as usize;
let size = FRAME_SIZE * channels; let size = FRAME_SIZE * channels;
@ -104,18 +121,26 @@ impl State {
// FIXME: The first chunks coming out of the denoisers contains some // FIXME: The first chunks coming out of the denoisers contains some
// fade-in artifacts. We might want to discard those. // fade-in artifacts. We might want to discard those.
let mut vad: f32 = 0.0;
for channel_denoiser in &mut self.denoisers { for channel_denoiser in &mut self.denoisers {
channel_denoiser.denoiser.process_frame( vad = f32::max(
&mut channel_denoiser.out_chunk[..], vad,
&channel_denoiser.frame_chunk[..], channel_denoiser.denoiser.process_frame(
&mut channel_denoiser.out_chunk[..],
&channel_denoiser.frame_chunk[..],
),
); );
} }
for (index, item) in out_frame.iter_mut().enumerate() { if vad < settings.vad_threshold {
let channel_index = index % channels; out_frame.fill(0.0);
let channel_denoiser = &self.denoisers[channel_index]; } else {
let pos = index / channels; for (index, item) in out_frame.iter_mut().enumerate() {
*item = channel_denoiser.out_chunk[pos] / 32767.0; let channel_index = index % channels;
let channel_denoiser = &self.denoisers[channel_index];
let pos = index / channels;
*item = channel_denoiser.out_chunk[pos] / 32767.0;
}
} }
} }
} }
@ -131,6 +156,7 @@ impl AudioRNNoise {
return Ok(gst::FlowSuccess::Ok); return Ok(gst::FlowSuccess::Ok);
} }
let settings = *self.settings.lock().unwrap();
let mut buffer = gst::Buffer::with_size(available).map_err(|e| { let mut buffer = gst::Buffer::with_size(available).map_err(|e| {
gst::error!(CAT, imp: self, "Failed to allocate buffer at EOS {:?}", e); gst::error!(CAT, imp: self, "Failed to allocate buffer at EOS {:?}", e);
gst::FlowError::Flushing gst::FlowError::Flushing
@ -151,7 +177,7 @@ impl AudioRNNoise {
let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?; let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?;
let out_data = out_map.as_mut_slice_of::<f32>().unwrap(); let out_data = out_map.as_mut_slice_of::<f32>().unwrap();
state.process(in_data, out_data); state.process(in_data, out_data, &settings);
} }
self.obj().src_pad().push(buffer) self.obj().src_pad().push(buffer)
@ -164,6 +190,7 @@ impl AudioRNNoise {
let duration = state.buffer_duration(output_size as _); let duration = state.buffer_duration(output_size as _);
let pts = state.current_pts(); let pts = state.current_pts();
let settings = *self.settings.lock().unwrap();
let mut buffer = gst::Buffer::with_size(output_size).map_err(|_| gst::FlowError::Error)?; let mut buffer = gst::Buffer::with_size(output_size).map_err(|_| gst::FlowError::Error)?;
{ {
@ -181,7 +208,7 @@ impl AudioRNNoise {
let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?; let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?;
let out_data = out_map.as_mut_slice_of::<f32>().unwrap(); let out_data = out_map.as_mut_slice_of::<f32>().unwrap();
state.process(in_data, out_data); state.process(in_data, out_data, &settings);
} }
Ok(GenerateOutputSuccess::Buffer(buffer)) Ok(GenerateOutputSuccess::Buffer(buffer))
@ -195,7 +222,42 @@ impl ObjectSubclass for AudioRNNoise {
type ParentType = gst_base::BaseTransform; type ParentType = gst_base::BaseTransform;
} }
impl ObjectImpl for AudioRNNoise {} impl ObjectImpl for AudioRNNoise {
fn properties() -> &'static [glib::ParamSpec] {
static PROPERTIES: Lazy<Vec<glib::ParamSpec>> = Lazy::new(|| {
vec![glib::ParamSpecFloat::builder("voice-activity-threshold")
.nick("Voice activity threshold")
.blurb("Threshold of the voice activity detector below which to mute the output")
.minimum(0.0)
.maximum(1.0)
.default_value(DEFAULT_VOICE_ACTIVITY_THRESHOLD)
.mutable_playing()
.build()]
});
PROPERTIES.as_ref()
}
fn set_property(&self, _id: usize, value: &glib::Value, pspec: &glib::ParamSpec) {
match pspec.name() {
"voice-activity-threshold" => {
let mut settings = self.settings.lock().unwrap();
settings.vad_threshold = value.get().expect("type checked upstream");
}
_ => unimplemented!(),
}
}
fn property(&self, _id: usize, pspec: &glib::ParamSpec) -> glib::Value {
match pspec.name() {
"voice-activity-threshold" => {
let settings = self.settings.lock().unwrap();
settings.vad_threshold.to_value()
}
_ => unimplemented!(),
}
}
}
impl GstObjectImpl for AudioRNNoise {} impl GstObjectImpl for AudioRNNoise {}

View file

@ -3824,6 +3824,22 @@
"presence": "always" "presence": "always"
} }
}, },
"properties": {
"voice-activity-threshold": {
"blurb": "Threshold of the voice activity detector below which to mute the output",
"conditionally-available": false,
"construct": false,
"construct-only": false,
"controllable": false,
"default": "0",
"max": "1",
"min": "0",
"mutable": "playing",
"readable": true,
"type": "gfloat",
"writable": true
}
},
"rank": "none" "rank": "none"
}, },
"ebur128level": { "ebur128level": {