From f5bd88ab6bb379c669083e38d239508c99842b98 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sun, 2 Oct 2022 20:41:53 +0900 Subject: [PATCH] Add vad_threshold to Context --- whispering/schema.py | 1 + whispering/transcriber.py | 1 + whispering/vad.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/whispering/schema.py b/whispering/schema.py index 14ef78a..2e65c72 100644 --- a/whispering/schema.py +++ b/whispering/schema.py @@ -41,6 +41,7 @@ class Context(BaseModel, arbitrary_types_allowed=True): logprob_threshold: Optional[float] = -1.0 compression_ratio_threshold: Optional[float] = 2.4 buffer_threshold: Optional[float] = 0.5 + vad_threshold: float = 0.5 class ParsedChunk(BaseModel): diff --git a/whispering/transcriber.py b/whispering/transcriber.py index be3039c..701c505 100644 --- a/whispering/transcriber.py +++ b/whispering/transcriber.py @@ -240,6 +240,7 @@ class WhisperStreamingTranscriber: for v in self.vad( audio=audio, total_block_number=1, + threshold=ctx.vad_threshold, ) ] if len(x) == 0: # No speech diff --git a/whispering/vad.py b/whispering/vad.py index 28ec341..17ec68f 100644 --- a/whispering/vad.py +++ b/whispering/vad.py @@ -22,7 +22,7 @@ class VAD: self, *, audio: np.ndarray, - threshold: float = 0.5, + threshold: float, total_block_number: Optional[int] = None, ) -> Iterator[SpeechSegment]: # audio.shape should be multiple of (N_FRAMES,)