Add min-confidence property
This commit is contained in:
parent
43677b6c09
commit
4e30659807
3 changed files with 74 additions and 9 deletions
|
@ -10,7 +10,7 @@ build = "build.rs"
|
|||
|
||||
[dependencies]
|
||||
gst = { package = "gstreamer", version = "0.18" }
|
||||
gstreamer-base = "0.18"
|
||||
gst-base = { package = "gstreamer-base", version = "0.18" }
|
||||
once_cell = "1"
|
||||
atomic_refcell = "0.1"
|
||||
serde = "1"
|
||||
|
@ -21,7 +21,7 @@ tokio = { version = "1.0", features = [ "rt-multi-thread", "time" ] }
|
|||
async-tungstenite = { version = "0.17", features = ["tokio", "tokio-runtime", "tokio-native-tls"] }
|
||||
|
||||
[build-dependencies]
|
||||
gst-plugin-version-helper = "0.7.3"
|
||||
gst-plugin-version-helper = "0.7"
|
||||
|
||||
[lib]
|
||||
name = "gstvosk"
|
||||
|
|
48
README.md
48
README.md
|
@ -1,12 +1,48 @@
|
|||
Vosk Speech Recognition GStreamer Plugin
|
||||
========================================
|
||||
# Vosk Speech Recognition GStreamer Plugin
|
||||
|
||||
Transcription of speech using [Vosk Toolkit](https://alphacephei.com/vosk/). Can be used to generate subtitles for
|
||||
videos, transcription of audio notes, etc.
|
||||
movies, live streams, lectures and interviews.
|
||||
|
||||
Usage
|
||||
-----
|
||||
> Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and
|
||||
> dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese,
|
||||
> Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto, Hindi.
|
||||
> More to come.
|
||||
>
|
||||
> https://github.com/alphacep/vosk-api
|
||||
|
||||
This GStreamer plugin was inspired by the work of [@MathieuDuponchelle](https://github.com/mathieuduponchelle) in the
|
||||
[AwsTranscriber](https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/tree/main/net/rusoto#awstranscriber) element.
|
||||
|
||||
## Build
|
||||
|
||||
Compiling this project will provide a shared library that can be used by your local GStreamer installation.
|
||||
|
||||
```bash
|
||||
GST_DEBUG=1,vosk_transcriber:5 gst-launch-1.0 filesrc location=/Users/rafaelcaricio/astronaut.mkv ! matroskademux name=d d.audio_0 ! decodebin ! audiorate ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=48000,channels=1 ! vosk_transcriber server-address=ws://192.168.178.20:2700 ! fakesink dump=true --gst-plugin-path=/Users/rafaelcaricio/development/gst-plugin-vosk/target/release/
|
||||
cargo build --release
|
||||
```
|
||||
|
||||
The compiled shared library `./target/release/libgstvosk.dylib` must be made loadable by GStreamer. One possible
|
||||
solution is to use the argument `--gst-plugin-path=` pointing to the location where the library file is every time you
|
||||
run the `gst-launch-1.0` command-line tool.
|
||||
|
||||
|
||||
## Example Usage
|
||||
|
||||
This plugin connects via the WebSocket protocol to the [Vosk Server](https://alphacephei.com/vosk/server). The easiest
|
||||
way to run the Vosk server is using [Docker](https://docs.docker.com/). You can run the server locally using
|
||||
this command:
|
||||
|
||||
```bash
|
||||
docker run --rm --name vosk-server -d -p 2700:2700 alphacep/kaldi-en:latest
|
||||
```
|
||||
|
||||
Running the recognition server as a separate process comes with the additional benefit that you don't need to
|
||||
install any special software. In addition, the voice recognition workload is kept out of your GStreamer pipeline process.
|
||||
|
||||
This example simply prints the raw text buffers published by the Vosk transcriber:
|
||||
|
||||
```bash
|
||||
gst-launch-1.0 \
|
||||
vosk_transcriber name=tc ! fakesink sync=true dump=true \
|
||||
uridecodebin uri=https://studio.blender.org/download-source/d1/d1f3b354a8f741c6afabf305489fa510/d1f3b354a8f741c6afabf305489fa510-1080p.mp4 ! audioconvert ! tc.
|
||||
```
|
|
@ -46,6 +46,8 @@ static RUNTIME: Lazy<runtime::Runtime> = Lazy::new(|| {
|
|||
|
||||
/// Default value for the `latency` setting: 30 seconds.
const DEFAULT_LATENCY: gst::ClockTime = gst::ClockTime::from_seconds(30);
|
||||
/// Default value for the `server_address` setting: a WebSocket URL on the local host, port 2700.
const DEFAULT_SERVER_ADDRESS: &str = "ws://localhost:2700";
|
||||
/// Default value for the `min-confidence` property (the `min_confidence_threshold` setting).
/// Transcribed items whose confidence is below the threshold are skipped.
const DEFAULT_MIN_CONFIDENCE_THRESHOLD: f64 = 0.7;
|
||||
|
||||
/// A 100 ms time step.
/// NOTE(review): where this is used is not visible in this excerpt; confirm against its callers.
const GRANULARITY: gst::ClockTime = gst::ClockTime::from_mseconds(100);
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
|
@ -55,6 +57,9 @@ struct Settings {
|
|||
|
||||
/// The address of the WebSocket server to connect to for transcription.
|
||||
server_address: String,
|
||||
|
||||
/// Transcription confidence threshold. Anything below this will be ignored.
|
||||
min_confidence_threshold: f64,
|
||||
}
|
||||
|
||||
impl Default for Settings {
|
||||
|
@ -62,6 +67,7 @@ impl Default for Settings {
|
|||
Settings {
|
||||
latency: DEFAULT_LATENCY,
|
||||
server_address: DEFAULT_SERVER_ADDRESS.to_string(),
|
||||
min_confidence_threshold: DEFAULT_MIN_CONFIDENCE_THRESHOLD,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -291,7 +297,13 @@ impl Transcriber {
|
|||
state: &mut State,
|
||||
transcription: &Vec<WordInfo>,
|
||||
) {
|
||||
let min_confidence_threshold = self.settings.lock().unwrap().min_confidence_threshold;
|
||||
for item in transcription.iter() {
|
||||
// Skip items with a confidence below the threshold
|
||||
if item.confidence < min_confidence_threshold {
|
||||
continue;
|
||||
}
|
||||
|
||||
let start_time = gst::ClockTime::from_nseconds((item.start * 1_000_000_000.0) as u64);
|
||||
let end_time = gst::ClockTime::from_nseconds((item.end * 1_000_000_000.0) as u64);
|
||||
|
||||
|
@ -950,6 +962,15 @@ impl ObjectImpl for Transcriber {
|
|||
Some(DEFAULT_SERVER_ADDRESS),
|
||||
glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
|
||||
),
|
||||
glib::ParamSpecDouble::new(
|
||||
"min-confidence",
|
||||
"Minimum Confidence",
|
||||
"Transcription minimum confidence threshold. Anything below this will be ignored.",
|
||||
0.0,
|
||||
1.0,
|
||||
DEFAULT_MIN_CONFIDENCE_THRESHOLD,
|
||||
glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
|
||||
),
|
||||
]
|
||||
});
|
||||
|
||||
|
@ -982,6 +1003,10 @@ impl ObjectImpl for Transcriber {
|
|||
let mut settings = self.settings.lock().unwrap();
|
||||
settings.server_address = value.get().expect("type checked upstream")
|
||||
}
|
||||
"min-confidence" => {
|
||||
let mut settings = self.settings.lock().unwrap();
|
||||
settings.min_confidence_threshold = value.get().expect("type checked upstream")
|
||||
}
|
||||
_ => unimplemented!(),
|
||||
}
|
||||
}
|
||||
|
@ -996,6 +1021,10 @@ impl ObjectImpl for Transcriber {
|
|||
let settings = self.settings.lock().unwrap();
|
||||
settings.server_address.to_value()
|
||||
}
|
||||
"min-confidence" => {
|
||||
let settings = self.settings.lock().unwrap();
|
||||
settings.min_confidence_threshold.to_value()
|
||||
}
|
||||
_ => unimplemented!(),
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue