Add min-confidence property

This commit is contained in:
Rafael Caricio 2022-04-01 13:24:19 +02:00
parent 43677b6c09
commit 4e30659807
Signed by: rafaelcaricio
GPG key ID: 3C86DBCE8E93C947
3 changed files with 74 additions and 9 deletions

View file

@ -10,7 +10,7 @@ build = "build.rs"
[dependencies]
gst = { package = "gstreamer", version = "0.18" }
gstreamer-base = "0.18"
gst-base = { package = "gstreamer-base", version = "0.18" }
once_cell = "1"
atomic_refcell = "0.1"
serde = "1"
@ -21,7 +21,7 @@ tokio = { version = "1.0", features = [ "rt-multi-thread", "time" ] }
async-tungstenite = { version = "0.17", features = ["tokio", "tokio-runtime", "tokio-native-tls"] }
[build-dependencies]
gst-plugin-version-helper = "0.7.3"
gst-plugin-version-helper = "0.7"
[lib]
name = "gstvosk"

View file

@ -1,12 +1,48 @@
Vosk Speech Recognition GStreamer Plugin
========================================
# Vosk Speech Recognition GStreamer Plugin
Transcription of speech using [Vosk Toolkit](https://alphacephei.com/vosk/). Can be used to generate subtitles for
videos, transcription of audio notes, etc.
movies, live streams, lectures and interviews.
Usage
-----
> Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and
> dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese,
> Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto, Hindi.
> More to come.
>
> https://github.com/alphacep/vosk-api
This GStreamer plugin was inspired by the work of [@MathieuDuponchelle](https://github.com/mathieuduponchelle) in the
[AwsTranscriber](https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/tree/main/net/rusoto#awstranscriber) element.
## Build
Compiling this project will provide a shared library that can be used by your local GStreamer installation.
```bash
GST_DEBUG=1,vosk_transcriber:5 gst-launch-1.0 filesrc location=/Users/rafaelcaricio/astronaut.mkv ! matroskademux name=d d.audio_0 ! decodebin ! audiorate ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=48000,channels=1 ! vosk_transcriber server-address=ws://192.168.178.20:2700 ! fakesink dump=true --gst-plugin-path=/Users/rafaelcaricio/development/gst-plugin-vosk/target/release/
```
cargo build --release
```
The compiled shared library `./target/release/libgstvosk.dylib` (`libgstvosk.so` on Linux) must be made loadable by
GStreamer. One possible solution is to pass the argument `--gst-plugin-path=`, pointing to the directory that contains
the library file, every time you run the `gst-launch-1.0` command line tool.
## Example Usage
This plugin connects to the [Vosk Server](https://alphacephei.com/vosk/server) via the WebSocket protocol. The easiest
way to run the Vosk server is using [Docker](https://docs.docker.com/). You can run the server locally using
this command:
```bash
docker run --rm --name vosk-server -d -p 2700:2700 alphacep/kaldi-en:latest
```
Running the recognition server as a separate process comes with the additional benefit that you don't need to
install any special software. In addition, the voice recognition workload is kept out of your GStreamer pipeline process.
This example simply prints the raw text buffers produced by the Vosk transcriber:
```bash
gst-launch-1.0 \
vosk_transcriber name=tc ! fakesink sync=true dump=true \
uridecodebin uri=https://studio.blender.org/download-source/d1/d1f3b354a8f741c6afabf305489fa510/d1f3b354a8f741c6afabf305489fa510-1080p.mp4 ! audioconvert ! tc.
```

View file

@ -46,6 +46,8 @@ static RUNTIME: Lazy<runtime::Runtime> = Lazy::new(|| {
/// Default value of the `latency` setting: 30 seconds.
const DEFAULT_LATENCY: gst::ClockTime = gst::ClockTime::from_seconds(30);
/// Default value of the `server_address` setting: a WebSocket URL on localhost, port 2700.
const DEFAULT_SERVER_ADDRESS: &str = "ws://localhost:2700";
/// Default value of the `min-confidence` property; transcription items whose confidence is
/// below the configured threshold are skipped.
const DEFAULT_MIN_CONFIDENCE_THRESHOLD: f64 = 0.7;
/// A 100 ms time step.
/// NOTE(review): where this is used is not visible in this excerpt — confirm its role before changing it.
const GRANULARITY: gst::ClockTime = gst::ClockTime::from_mseconds(100);
#[derive(Debug, Clone)]
@ -55,6 +57,9 @@ struct Settings {
/// The address of the Vosk WebSocket server to connect to for transcription.
server_address: String,
/// Transcription confidence threshold. Anything below this will be ignored.
min_confidence_threshold: f64,
}
impl Default for Settings {
@ -62,6 +67,7 @@ impl Default for Settings {
Settings {
latency: DEFAULT_LATENCY,
server_address: DEFAULT_SERVER_ADDRESS.to_string(),
min_confidence_threshold: DEFAULT_MIN_CONFIDENCE_THRESHOLD,
}
}
}
@ -291,7 +297,13 @@ impl Transcriber {
state: &mut State,
transcription: &Vec<WordInfo>,
) {
let min_confidence_threshold = self.settings.lock().unwrap().min_confidence_threshold;
for item in transcription.iter() {
// Skip items with a confidence below the threshold
if item.confidence < min_confidence_threshold {
continue;
}
let start_time = gst::ClockTime::from_nseconds((item.start * 1_000_000_000.0) as u64);
let end_time = gst::ClockTime::from_nseconds((item.end * 1_000_000_000.0) as u64);
@ -950,6 +962,15 @@ impl ObjectImpl for Transcriber {
Some(DEFAULT_SERVER_ADDRESS),
glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
),
glib::ParamSpecDouble::new(
"min-confidence",
"Minimum Confidence",
"Transcription minimum confidence threshold. Anything below this will be ignored.",
0.0,
1.0,
DEFAULT_MIN_CONFIDENCE_THRESHOLD,
glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
),
]
});
@ -982,6 +1003,10 @@ impl ObjectImpl for Transcriber {
let mut settings = self.settings.lock().unwrap();
settings.server_address = value.get().expect("type checked upstream")
}
"min-confidence" => {
let mut settings = self.settings.lock().unwrap();
settings.min_confidence_threshold = value.get().expect("type checked upstream")
}
_ => unimplemented!(),
}
}
@ -996,6 +1021,10 @@ impl ObjectImpl for Transcriber {
let settings = self.settings.lock().unwrap();
settings.server_address.to_value()
}
"min-confidence" => {
let settings = self.settings.lock().unwrap();
settings.min_confidence_threshold.to_value()
}
_ => unimplemented!(),
}
}