From 86f38c6ca91a2bc9ff54836906f734ea7ceae502 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe <yuta@hayashibe.jp>
Date: Sat, 15 Oct 2022 13:33:39 +0900
Subject: [PATCH] Removed --no-vad option and added --vad option to set threshold

---
 README.md                 | 2 +-
 tests/test_cli.py         | 1 -
 whispering/cli.py         | 9 +++++----
 whispering/schema.py      | 3 +--
 whispering/transcriber.py | 2 +-
 5 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 2a3c9ed..c14c41a 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ whispering --language en --model tiny
 - ``--no-progress`` disables the progress message
 - ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
 - ``--debug`` outputs logs for debug
-- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze non-voice activity sound period
+- ``--vad`` sets the VAD (Voice Activity Detection) threshold (Default: 0.5). ``0`` disables VAD and forces whisper to analyze non-voice activity sound periods
 - ``--output`` sets output file (Default: Standard output)
 
 ### Parse interval
diff --git a/tests/test_cli.py b/tests/test_cli.py
index a0702be..a63c7d8 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -15,7 +15,6 @@ def test_options():
         "--mode server --beam_size 3",
         "--mode server --temperature 0",
         "--mode server --allow-padding",
-        "--mode server --no-vad",
         "--mode server --num_block 3",
         "--mode mic --host 0.0.0.0",
         "--mode mic --port 8000",
diff --git a/whispering/cli.py b/whispering/cli.py
index 1148751..884d999 100644
--- a/whispering/cli.py
+++ b/whispering/cli.py
@@ -149,8 +149,10 @@ def get_opts() -> argparse.Namespace:
         action="store_true",
     )
     group_ctx.add_argument(
-        "--no-vad",
-        action="store_true",
+        "--vad",
+        type=float,
+        help="Threshold of VAD",
+        default=0.5,
     )
 
     group_misc = parser.add_argument_group("Other options")
@@ -223,7 +225,7 @@ def get_context(*, opts) -> Context:
         beam_size=opts.beam_size,
         temperatures=opts.temperature,
         allow_padding=opts.allow_padding,
-        vad=not opts.no_vad,
+        vad_threshold=opts.vad,
     )
     logger.debug(f"Context: {ctx}")
     return ctx
@@ -244,7 +246,6 @@ def is_valid_arg(opts) -> bool:
             "beam_size",
             "temperature",
             "allow_padding",
-            "no-vad",
         ]
     elif opts.mode == Mode.mic.value:
         keys = [
diff --git a/whispering/schema.py b/whispering/schema.py
index ee0272f..e8b4e97 100644
--- a/whispering/schema.py
+++ b/whispering/schema.py
@@ -32,7 +32,6 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     timestamp: float = 0.0
     buffer_tokens: List[torch.Tensor] = []
     buffer_mel: Optional[torch.Tensor] = None
-    vad: bool = True
 
     temperatures: List[float]
     allow_padding: bool = False
@@ -46,7 +45,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     logprob_threshold: Optional[float] = -1.0
     compression_ratio_threshold: Optional[float] = 2.4
     buffer_threshold: Optional[float] = 0.5
-    vad_threshold: float = 0.5
+    vad_threshold: float
 
 
 class ParsedChunk(BaseModel):
diff --git a/whispering/transcriber.py b/whispering/transcriber.py
index 878db24..45f5ddc 100644
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -234,7 +234,7 @@ class WhisperStreamingTranscriber:
     ) -> Iterator[ParsedChunk]:
         logger.debug(f"{len(audio)}")
 
-        if ctx.vad:
+        if ctx.vad_threshold > 0.0:
             x = [
                 v
                 for v in self.vad(