diff --git a/whispering/schema.py b/whispering/schema.py
index 14ef78a..2e65c72 100644
--- a/whispering/schema.py
+++ b/whispering/schema.py
@@ -41,6 +41,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     logprob_threshold: Optional[float] = -1.0
     compression_ratio_threshold: Optional[float] = 2.4
     buffer_threshold: Optional[float] = 0.5
+    vad_threshold: float = 0.5
 
 
 class ParsedChunk(BaseModel):
diff --git a/whispering/transcriber.py b/whispering/transcriber.py
index be3039c..701c505 100644
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -240,6 +240,7 @@ class WhisperStreamingTranscriber:
             for v in self.vad(
                 audio=audio,
                 total_block_number=1,
+                threshold=ctx.vad_threshold,
             )
         ]
         if len(x) == 0:  # No speech
diff --git a/whispering/vad.py b/whispering/vad.py
index 28ec341..17ec68f 100644
--- a/whispering/vad.py
+++ b/whispering/vad.py
@@ -22,7 +22,7 @@ class VAD:
         self,
         *,
         audio: np.ndarray,
-        threshold: float = 0.5,
+        threshold: float,
         total_block_number: Optional[int] = None,
     ) -> Iterator[SpeechSegment]:
         # audio.shape should be multiple of (N_FRAMES,)
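
Usage note: the new `vad_threshold` field rides on the per-session `Context`, and dropping the default from `VAD.__call__` makes the threshold a required keyword argument, so any call site that forgets to thread the value through now fails loudly instead of silently falling back to 0.5. Below is a minimal self-contained sketch of that pattern; the class bodies are illustrative stand-ins (a toy amplitude check in place of the repo's model-based detector), not the actual implementations.

```python
from typing import Iterator, Optional

import numpy as np
from pydantic import BaseModel


class Context(BaseModel):
    # Illustrative subset of the real Context model
    buffer_threshold: Optional[float] = 0.5
    vad_threshold: float = 0.5  # field added by this diff


class SpeechSegment(BaseModel):
    start_block_idx: int
    end_block_idx: int


class VAD:
    def __call__(
        self,
        *,
        audio: np.ndarray,
        threshold: float,  # required: callers must pass ctx.vad_threshold explicitly
        total_block_number: Optional[int] = None,
    ) -> Iterator[SpeechSegment]:
        # Toy detector: treat the whole buffer as one block and report it
        # as speech when mean absolute amplitude exceeds the threshold.
        if float(np.abs(audio).mean()) > threshold:
            yield SpeechSegment(start_block_idx=0, end_block_idx=1)


ctx = Context(vad_threshold=0.3)  # more sensitive than the 0.5 default
segments = list(
    VAD()(audio=np.zeros(16000, dtype=np.float32), threshold=ctx.vad_threshold)
)
print(segments)  # prints [], since silence never crosses a positive threshold
```

Keeping the default on the `Context` field but not on `VAD.__call__` leaves a single source of truth for the threshold, which is the point of the vad.py hunk.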