Add min-confidence property
This commit is contained in:
parent
43677b6c09
commit
4e30659807
3 changed files with 74 additions and 9 deletions
|
@ -10,7 +10,7 @@ build = "build.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
gst = { package = "gstreamer", version = "0.18" }
|
gst = { package = "gstreamer", version = "0.18" }
|
||||||
gstreamer-base = "0.18"
|
gst-base = { package = "gstreamer-base", version = "0.18" }
|
||||||
once_cell = "1"
|
once_cell = "1"
|
||||||
atomic_refcell = "0.1"
|
atomic_refcell = "0.1"
|
||||||
serde = "1"
|
serde = "1"
|
||||||
|
@ -21,7 +21,7 @@ tokio = { version = "1.0", features = [ "rt-multi-thread", "time" ] }
|
||||||
async-tungstenite = { version = "0.17", features = ["tokio", "tokio-runtime", "tokio-native-tls"] }
|
async-tungstenite = { version = "0.17", features = ["tokio", "tokio-runtime", "tokio-native-tls"] }
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
gst-plugin-version-helper = "0.7.3"
|
gst-plugin-version-helper = "0.7"
|
||||||
|
|
||||||
[lib]
|
[lib]
|
||||||
name = "gstvosk"
|
name = "gstvosk"
|
||||||
|
|
48
README.md
48
README.md
|
@ -1,12 +1,48 @@
|
||||||
Vosk Speech Recognition GStreamer Plugin
|
# Vosk Speech Recognition GStreamer Plugin
|
||||||
========================================
|
|
||||||
|
|
||||||
Transcription of speech using [Vosk Toolkit](https://alphacephei.com/vosk/). Can be used to generate subtitles for
|
Transcription of speech using [Vosk Toolkit](https://alphacephei.com/vosk/). Can be used to generate subtitles for
|
||||||
videos, transcription of audio notes, etc.
|
movies, live streams, lectures and interviews.
|
||||||
|
|
||||||
Usage
|
> Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and
|
||||||
-----
|
> dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese,
|
||||||
|
> Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto, Hindi.
|
||||||
|
> More to come.
|
||||||
|
>
|
||||||
|
> https://github.com/alphacep/vosk-api
|
||||||
|
|
||||||
|
This GStreamer plugin was inspired by the work of [@MathieuDuponchelle](https://github.com/mathieuduponchelle) in the
|
||||||
|
[AwsTranscriber](https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/tree/main/net/rusoto#awstranscriber) element.
|
||||||
|
|
||||||
|
## Build
|
||||||
|
|
||||||
|
Compiling this project will provide a shared library that can be used by your local GStreamer installation.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
GST_DEBUG=1,vosk_transcriber:5 gst-launch-1.0 filesrc location=/Users/rafaelcaricio/astronaut.mkv ! matroskademux name=d d.audio_0 ! decodebin ! audiorate ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=48000,channels=1 ! vosk_transcriber server-address=ws://192.168.178.20:2700 ! fakesink dump=true --gst-plugin-path=/Users/rafaelcaricio/development/gst-plugin-vosk/target/release/
|
cargo build --release
|
||||||
|
```
|
||||||
|
|
||||||
|
The compiled shared library `./target/release/libgstvosk.dylib` must be made loadable to GStreamer. One possible
|
||||||
|
solution is to pass the `--gst-plugin-path=` argument, pointing to the directory that contains the library file, every time you
|
||||||
|
run the `gst-launch-1.0` command-line tool.
|
||||||
|
|
||||||
|
|
||||||
|
## Example Usage
|
||||||
|
|
||||||
|
This plugin connects via the WebSocket protocol to the [Vosk Server](https://alphacephei.com/vosk/server). The easiest
|
||||||
|
way to run the Vosk server is using [Docker](https://docs.docker.com/). You can run the server locally using
|
||||||
|
this command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run --rm --name vosk-server -d -p 2700:2700 alphacep/kaldi-en:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
Running the recognition server as a separate process comes with the additional benefit that you don't need to
|
||||||
|
install any special software. In addition, the speech recognition workload is kept out of your GStreamer pipeline process.
|
||||||
|
|
||||||
|
This example simply prints the raw text buffers produced by the Vosk transcriber:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
gst-launch-1.0 \
|
||||||
|
vosk_transcriber name=tc ! fakesink sync=true dump=true \
|
||||||
|
uridecodebin uri=https://studio.blender.org/download-source/d1/d1f3b354a8f741c6afabf305489fa510/d1f3b354a8f741c6afabf305489fa510-1080p.mp4 ! audioconvert ! tc.
|
||||||
```
|
```
|
|
@ -46,6 +46,8 @@ static RUNTIME: Lazy<runtime::Runtime> = Lazy::new(|| {
|
||||||
|
|
||||||
const DEFAULT_LATENCY: gst::ClockTime = gst::ClockTime::from_seconds(30);
|
const DEFAULT_LATENCY: gst::ClockTime = gst::ClockTime::from_seconds(30);
|
||||||
const DEFAULT_SERVER_ADDRESS: &str = "ws://localhost:2700";
|
const DEFAULT_SERVER_ADDRESS: &str = "ws://localhost:2700";
|
||||||
|
const DEFAULT_MIN_CONFIDENCE_THRESHOLD: f64 = 0.7;
|
||||||
|
|
||||||
const GRANULARITY: gst::ClockTime = gst::ClockTime::from_mseconds(100);
|
const GRANULARITY: gst::ClockTime = gst::ClockTime::from_mseconds(100);
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
|
@ -55,6 +57,9 @@ struct Settings {
|
||||||
|
|
||||||
/// The address of the gRPC server to connect to for transcription.
|
/// The address of the gRPC server to connect to for transcription.
|
||||||
server_address: String,
|
server_address: String,
|
||||||
|
|
||||||
|
/// Transcription confidence threshold. Anything below this will be ignored.
|
||||||
|
min_confidence_threshold: f64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Settings {
|
impl Default for Settings {
|
||||||
|
@ -62,6 +67,7 @@ impl Default for Settings {
|
||||||
Settings {
|
Settings {
|
||||||
latency: DEFAULT_LATENCY,
|
latency: DEFAULT_LATENCY,
|
||||||
server_address: DEFAULT_SERVER_ADDRESS.to_string(),
|
server_address: DEFAULT_SERVER_ADDRESS.to_string(),
|
||||||
|
min_confidence_threshold: DEFAULT_MIN_CONFIDENCE_THRESHOLD,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -291,7 +297,13 @@ impl Transcriber {
|
||||||
state: &mut State,
|
state: &mut State,
|
||||||
transcription: &Vec<WordInfo>,
|
transcription: &Vec<WordInfo>,
|
||||||
) {
|
) {
|
||||||
|
let min_confidence_threshold = self.settings.lock().unwrap().min_confidence_threshold;
|
||||||
for item in transcription.iter() {
|
for item in transcription.iter() {
|
||||||
|
// Skip items with a confidence below the threshold
|
||||||
|
if item.confidence < min_confidence_threshold {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
let start_time = gst::ClockTime::from_nseconds((item.start * 1_000_000_000.0) as u64);
|
let start_time = gst::ClockTime::from_nseconds((item.start * 1_000_000_000.0) as u64);
|
||||||
let end_time = gst::ClockTime::from_nseconds((item.end * 1_000_000_000.0) as u64);
|
let end_time = gst::ClockTime::from_nseconds((item.end * 1_000_000_000.0) as u64);
|
||||||
|
|
||||||
|
@ -950,6 +962,15 @@ impl ObjectImpl for Transcriber {
|
||||||
Some(DEFAULT_SERVER_ADDRESS),
|
Some(DEFAULT_SERVER_ADDRESS),
|
||||||
glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
|
glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
|
||||||
),
|
),
|
||||||
|
glib::ParamSpecDouble::new(
|
||||||
|
"min-confidence",
|
||||||
|
"Minimum Confidence",
|
||||||
|
"Transcription minimum confidence threshold. Anything below this will be ignored.",
|
||||||
|
0.0,
|
||||||
|
1.0,
|
||||||
|
DEFAULT_MIN_CONFIDENCE_THRESHOLD,
|
||||||
|
glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
|
||||||
|
),
|
||||||
]
|
]
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -982,6 +1003,10 @@ impl ObjectImpl for Transcriber {
|
||||||
let mut settings = self.settings.lock().unwrap();
|
let mut settings = self.settings.lock().unwrap();
|
||||||
settings.server_address = value.get().expect("type checked upstream")
|
settings.server_address = value.get().expect("type checked upstream")
|
||||||
}
|
}
|
||||||
|
"min-confidence" => {
|
||||||
|
let mut settings = self.settings.lock().unwrap();
|
||||||
|
settings.min_confidence_threshold = value.get().expect("type checked upstream")
|
||||||
|
}
|
||||||
_ => unimplemented!(),
|
_ => unimplemented!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -996,6 +1021,10 @@ impl ObjectImpl for Transcriber {
|
||||||
let settings = self.settings.lock().unwrap();
|
let settings = self.settings.lock().unwrap();
|
||||||
settings.server_address.to_value()
|
settings.server_address.to_value()
|
||||||
}
|
}
|
||||||
|
"min-confidence" => {
|
||||||
|
let settings = self.settings.lock().unwrap();
|
||||||
|
settings.min_confidence_threshold.to_value()
|
||||||
|
}
|
||||||
_ => unimplemented!(),
|
_ => unimplemented!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue