From f5bd88ab6bb379c669083e38d239508c99842b98 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe <yuta@hayashibe.jp>
Date: Sun, 2 Oct 2022 20:41:53 +0900
Subject: [PATCH] Add vad_threshold to Context

---
 whispering/schema.py      | 1 +
 whispering/transcriber.py | 1 +
 whispering/vad.py         | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/whispering/schema.py b/whispering/schema.py
index 14ef78a..2e65c72 100644
--- a/whispering/schema.py
+++ b/whispering/schema.py
@@ -41,6 +41,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     logprob_threshold: Optional[float] = -1.0
     compression_ratio_threshold: Optional[float] = 2.4
     buffer_threshold: Optional[float] = 0.5
+    vad_threshold: float = 0.5
 
 
 class ParsedChunk(BaseModel):
diff --git a/whispering/transcriber.py b/whispering/transcriber.py
index be3039c..701c505 100644
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -240,6 +240,7 @@ class WhisperStreamingTranscriber:
                 for v in self.vad(
                     audio=audio,
                     total_block_number=1,
+                    threshold=ctx.vad_threshold,
                 )
             ]
             if len(x) == 0:  # No speech
diff --git a/whispering/vad.py b/whispering/vad.py
index 28ec341..17ec68f 100644
--- a/whispering/vad.py
+++ b/whispering/vad.py
@@ -22,7 +22,7 @@ class VAD:
         self,
         *,
         audio: np.ndarray,
-        threshold: float = 0.5,
+        threshold: float,
         total_block_number: Optional[int] = None,
     ) -> Iterator[SpeechSegment]:
         # audio.shape should be multiple of (N_FRAMES,)