mirror of
https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs.git
synced 2024-11-25 04:51:26 +00:00
audiornnoise: Attach audio level meta to output buffers
This is useful downstream when processing voice audio, for instance when feeding a speech recognition library such as Whisper.
Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/merge_requests/1231>
This commit is contained in:
parent
90e06dc37b
commit
4d9263f932
2 changed files with 36 additions and 12 deletions
|
@ -9,9 +9,9 @@ edition = "2021"
|
||||||
rust-version = "1.70"
|
rust-version = "1.70"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
gst = { package = "gstreamer", git = "https://gitlab.freedesktop.org/gstreamer/gstreamer-rs", features = ["v1_16"] }
|
gst = { package = "gstreamer", git = "https://gitlab.freedesktop.org/gstreamer/gstreamer-rs", features = ["v1_20"] }
|
||||||
gst-base = { package = "gstreamer-base", git = "https://gitlab.freedesktop.org/gstreamer/gstreamer-rs", features = ["v1_16"] }
|
gst-base = { package = "gstreamer-base", git = "https://gitlab.freedesktop.org/gstreamer/gstreamer-rs", features = ["v1_20"] }
|
||||||
gst-audio = { package = "gstreamer-audio", git = "https://gitlab.freedesktop.org/gstreamer/gstreamer-rs", features = ["v1_16"] }
|
gst-audio = { package = "gstreamer-audio", git = "https://gitlab.freedesktop.org/gstreamer/gstreamer-rs", features = ["v1_20"] }
|
||||||
anyhow = "1"
|
anyhow = "1"
|
||||||
byte-slice-cast = "1.0"
|
byte-slice-cast = "1.0"
|
||||||
num-traits = "0.2"
|
num-traits = "0.2"
|
||||||
|
|
|
@ -129,10 +129,13 @@ impl AudioRNNoise {
|
||||||
buffer.set_duration(duration);
|
buffer.set_duration(duration);
|
||||||
buffer.set_pts(pts);
|
buffer.set_pts(pts);
|
||||||
|
|
||||||
|
let (level, has_voice) = {
|
||||||
let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?;
|
let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?;
|
||||||
let out_data = out_map.as_mut_slice_of::<f32>().unwrap();
|
let out_data = out_map.as_mut_slice_of::<f32>().unwrap();
|
||||||
|
self.process(state, &settings, in_data, out_data)
|
||||||
|
};
|
||||||
|
|
||||||
self.process(state, &settings, in_data, out_data);
|
gst_audio::AudioLevelMeta::add(buffer, level, has_voice);
|
||||||
}
|
}
|
||||||
|
|
||||||
self.obj().src_pad().push(buffer)
|
self.obj().src_pad().push(buffer)
|
||||||
|
@ -160,10 +163,13 @@ impl AudioRNNoise {
|
||||||
buffer.set_duration(duration);
|
buffer.set_duration(duration);
|
||||||
buffer.set_pts(pts);
|
buffer.set_pts(pts);
|
||||||
|
|
||||||
|
let (level, has_voice) = {
|
||||||
let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?;
|
let mut out_map = buffer.map_writable().map_err(|_| gst::FlowError::Error)?;
|
||||||
let out_data = out_map.as_mut_slice_of::<f32>().unwrap();
|
let out_data = out_map.as_mut_slice_of::<f32>().unwrap();
|
||||||
|
self.process(state, &settings, in_data, out_data)
|
||||||
|
};
|
||||||
|
|
||||||
self.process(state, &settings, in_data, out_data);
|
gst_audio::AudioLevelMeta::add(buffer, level, has_voice);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(GenerateOutputSuccess::Buffer(buffer))
|
Ok(GenerateOutputSuccess::Buffer(buffer))
|
||||||
|
@ -175,9 +181,10 @@ impl AudioRNNoise {
|
||||||
settings: &Settings,
|
settings: &Settings,
|
||||||
input_plane: &[f32],
|
input_plane: &[f32],
|
||||||
output_plane: &mut [f32],
|
output_plane: &mut [f32],
|
||||||
) {
|
) -> (u8, bool) {
|
||||||
let channels = state.in_info.channels() as usize;
|
let channels = state.in_info.channels() as usize;
|
||||||
let size = FRAME_SIZE * channels;
|
let size = FRAME_SIZE * channels;
|
||||||
|
let mut has_voice = false;
|
||||||
|
|
||||||
for (out_frame, in_frame) in output_plane.chunks_mut(size).zip(input_plane.chunks(size)) {
|
for (out_frame, in_frame) in output_plane.chunks_mut(size).zip(input_plane.chunks(size)) {
|
||||||
for (index, item) in in_frame.iter().enumerate() {
|
for (index, item) in in_frame.iter().enumerate() {
|
||||||
|
@ -207,11 +214,15 @@ impl AudioRNNoise {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
gst::debug!(CAT, imp: self, "Voice activity: {}", vad);
|
gst::trace!(CAT, imp: self, "Voice activity: {}", vad);
|
||||||
|
|
||||||
if vad < settings.vad_threshold {
|
if vad < settings.vad_threshold {
|
||||||
out_frame.fill(0.0);
|
out_frame.fill(0.0);
|
||||||
} else {
|
} else {
|
||||||
|
// Upon voice activity nnoiseless never really reports a 1.0
|
||||||
|
// VAD, so we use a hardcoded value close to 1.0 here.
|
||||||
|
if vad >= 0.98 {
|
||||||
|
has_voice = true;
|
||||||
|
}
|
||||||
for (index, item) in out_frame.iter_mut().enumerate() {
|
for (index, item) in out_frame.iter_mut().enumerate() {
|
||||||
let channel_index = index % channels;
|
let channel_index = index % channels;
|
||||||
let channel_denoiser = &state.denoisers[channel_index];
|
let channel_denoiser = &state.denoisers[channel_index];
|
||||||
|
@ -220,6 +231,19 @@ impl AudioRNNoise {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let rms = output_plane.iter().copied().map(|x| x * x).sum::<f32>();
|
||||||
|
let level = (20.0 * f32::log10(rms + f32::EPSILON)) as u8;
|
||||||
|
|
||||||
|
gst::trace!(
|
||||||
|
CAT,
|
||||||
|
imp: self,
|
||||||
|
"rms: {}, level: {}, has_voice : {} ", rms,
|
||||||
|
level,
|
||||||
|
has_voice
|
||||||
|
);
|
||||||
|
|
||||||
|
(level, has_voice)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue