Add min-confidence property

2022-04-01 13:24:19 +02:00 · 2022-04-01 13:24:19 +02:00 · 4e30659807
commit 4e30659807
parent 43677b6c09
3 changed files with 74 additions and 9 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -10,7 +10,7 @@ build = "build.rs"

 [dependencies]
 gst = { package = "gstreamer", version = "0.18" }
-gstreamer-base = "0.18"
+gst-base = { package = "gstreamer-base", version = "0.18" }
 once_cell = "1"
 atomic_refcell = "0.1"
 serde = "1"
@ -21,7 +21,7 @@ tokio = { version = "1.0", features = [ "rt-multi-thread", "time" ] }
 async-tungstenite = { version = "0.17", features = ["tokio", "tokio-runtime", "tokio-native-tls"] }

 [build-dependencies]
-gst-plugin-version-helper = "0.7.3"
+gst-plugin-version-helper = "0.7"

 [lib]
 name = "gstvosk"
--- a/README.md
+++ b/README.md
@ -1,12 +1,48 @@
-Vosk Speech Recognition GStreamer Plugin
-========================================
+# Vosk Speech Recognition GStreamer Plugin

 Transcription of speech using [Vosk Toolkit](https://alphacephei.com/vosk/). Can be used to generate subtitles for
-videos, transcription of audio notes, etc.
+movies, live streams, lectures and interviews.

-Usage
-----
+> Vosk is an offline open source speech recognition toolkit. It enables speech recognition for 20+ languages and
+> dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese,
+> Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto, Hindi.
+> More to come.
+>
+> https://github.com/alphacep/vosk-api
+
+This GStreamer plugin was inspired by the work of [@MathieuDuponchelle](https://github.com/mathieuduponchelle) in the
+[AwsTranscriber](https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/tree/main/net/rusoto#awstranscriber) element.
+
+## Build
+
+Compiling this project will provide a shared library that can be used by your local GStreamer installation.

 ```bash
-GST_DEBUG=1,vosk_transcriber:5 gst-launch-1.0 filesrc location=/Users/rafaelcaricio/astronaut.mkv ! matroskademux name=d d.audio_0 ! decodebin ! audiorate ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=48000,channels=1 ! vosk_transcriber server-address=ws://192.168.178.20:2700 ! fakesink dump=true --gst-plugin-path=/Users/rafaelcaricio/development/gst-plugin-vosk/target/release/
-```
+cargo build --release
+```
+
+The compiled shared library `./target/release/libgstvosk.dylib` must be made loadable by GStreamer. One possible
+solution is to pass the argument `--gst-plugin-path=`, pointing to the directory that contains the library file, every
+time you run the `gst-launch-1.0` command line tool.
+
+
+## Example Usage
+
+This plugin connects via the WebSocket protocol to the [Vosk Server](https://alphacephei.com/vosk/server). The easiest
+way to run the Vosk server is using [Docker](https://docs.docker.com/). You can run the server locally using
+this command:
+
+```bash
+docker run --rm --name vosk-server -d -p 2700:2700 alphacep/kaldi-en:latest
+```
+
+Running the recognition server as a separate process comes with the additional benefit that you don't need to
+install any special software. Plus, the voice recognition workload is kept off your GStreamer pipeline process.
+
+This example will just print out the raw text buffers that are published by the Vosk transcriber:
+
+```bash
+gst-launch-1.0 \
+  vosk_transcriber name=tc ! fakesink sync=true dump=true \
+  uridecodebin uri=https://studio.blender.org/download-source/d1/d1f3b354a8f741c6afabf305489fa510/d1f3b354a8f741c6afabf305489fa510-1080p.mp4 ! audioconvert ! tc.
+```
--- a/src/transcriber/imp.rs
+++ b/src/transcriber/imp.rs
@ -46,6 +46,8 @@ static RUNTIME: Lazy<runtime::Runtime> = Lazy::new(|| {

 const DEFAULT_LATENCY: gst::ClockTime = gst::ClockTime::from_seconds(30);
 const DEFAULT_SERVER_ADDRESS: &str = "ws://localhost:2700";
+const DEFAULT_MIN_CONFIDENCE_THRESHOLD: f64 = 0.7;
+
 const GRANULARITY: gst::ClockTime = gst::ClockTime::from_mseconds(100);

 #[derive(Debug, Clone)]
@ -55,6 +57,9 @@ struct Settings {

    /// The address of the gRPC server to connect to for transcription.
    server_address: String,
+
+    /// Transcription confidence threshold. Anything below this will be ignored.
+    min_confidence_threshold: f64,
 }

 impl Default for Settings {
@ -62,6 +67,7 @@ impl Default for Settings {
        Settings {
            latency: DEFAULT_LATENCY,
            server_address: DEFAULT_SERVER_ADDRESS.to_string(),
+            min_confidence_threshold: DEFAULT_MIN_CONFIDENCE_THRESHOLD,
        }
    }
 }
@ -291,7 +297,13 @@ impl Transcriber {
        state: &mut State,
        transcription: &Vec<WordInfo>,
    ) {
+        let min_confidence_threshold = self.settings.lock().unwrap().min_confidence_threshold;
        for item in transcription.iter() {
+            // Skip items with a confidence below the threshold
+            if item.confidence < min_confidence_threshold {
+                continue;
+            }
+
            let start_time = gst::ClockTime::from_nseconds((item.start * 1_000_000_000.0) as u64);
            let end_time = gst::ClockTime::from_nseconds((item.end * 1_000_000_000.0) as u64);

@ -950,6 +962,15 @@ impl ObjectImpl for Transcriber {
                    Some(DEFAULT_SERVER_ADDRESS),
                    glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
                ),
+                glib::ParamSpecDouble::new(
+                    "min-confidence",
+                    "Minimum Confidence",
+                    "Transcription minimum confidence threshold. Anything below this will be ignored.",
+                    0.0,
+                    1.0,
+                    DEFAULT_MIN_CONFIDENCE_THRESHOLD,
+                    glib::ParamFlags::READWRITE | gst::PARAM_FLAG_MUTABLE_READY,
+                ),
            ]
        });

@ -982,6 +1003,10 @@ impl ObjectImpl for Transcriber {
                let mut settings = self.settings.lock().unwrap();
                settings.server_address = value.get().expect("type checked upstream")
            }
+            "min-confidence" => {
+                let mut settings = self.settings.lock().unwrap();
+                settings.min_confidence_threshold = value.get().expect("type checked upstream")
+            }
            _ => unimplemented!(),
        }
    }
@ -996,6 +1021,10 @@ impl ObjectImpl for Transcriber {
                let settings = self.settings.lock().unwrap();
                settings.server_address.to_value()
            }
+            "min-confidence" => {
+                let settings = self.settings.lock().unwrap();
+                settings.min_confidence_threshold.to_value()
+            }
            _ => unimplemented!(),
        }
    }