Removed --no-vad option and added --vad option to set threshold

2024-11-22 08:51:01 +00:00 · 2022-10-15 13:33:39 +09:00 · 2022-10-15 13:33:39 +09:00 · 86f38c6ca9
commit 86f38c6ca9
parent dce9719fea
5 changed files with 8 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ whispering --language en --model tiny
 - ``--no-progress`` disables the progress message
 - ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
 - ``--debug`` outputs logs for debug
- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze non-voice activity sound period
+- ``--vad`` sets the VAD (Voice Activity Detection) threshold (Default: 0.5). 0 disables VAD, which forces whisper to analyze periods of sound without voice activity
 - ``--output`` sets output file (Default: Standard output)

 ### Parse interval
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -15,7 +15,6 @@ def test_options():
        "--mode server --beam_size 3",
        "--mode server --temperature 0",
        "--mode server --allow-padding",
-        "--mode server --no-vad",
        "--mode server --num_block 3",
        "--mode mic --host 0.0.0.0",
        "--mode mic --port 8000",
--- a/whispering/cli.py
+++ b/whispering/cli.py
@@ -149,8 +149,10 @@ def get_opts() -> argparse.Namespace:
        action="store_true",
    )
    group_ctx.add_argument(
-        "--no-vad",
-        action="store_true",
+        "--vad",
+        type=float,
+        help="Threshold of VAD",
+        default=0.5,
    )

    group_misc = parser.add_argument_group("Other options")
@@ -223,7 +225,7 @@ def get_context(*, opts) -> Context:
        beam_size=opts.beam_size,
        temperatures=opts.temperature,
        allow_padding=opts.allow_padding,
-        vad=not opts.no_vad,
+        vad_threshold=opts.vad,
    )
    logger.debug(f"Context: {ctx}")
    return ctx
@@ -244,7 +246,6 @@ def is_valid_arg(opts) -> bool:
            "beam_size",
            "temperature",
            "allow_padding",
-            "no-vad",
        ]
    elif opts.mode == Mode.mic.value:
        keys = [
--- a/whispering/schema.py
+++ b/whispering/schema.py
@@ -32,7 +32,6 @@ class Context(BaseModel, arbitrary_types_allowed=True):
    timestamp: float = 0.0
    buffer_tokens: List[torch.Tensor] = []
    buffer_mel: Optional[torch.Tensor] = None
-    vad: bool = True

    temperatures: List[float]
    allow_padding: bool = False
@@ -46,7 +45,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
    logprob_threshold: Optional[float] = -1.0
    compression_ratio_threshold: Optional[float] = 2.4
    buffer_threshold: Optional[float] = 0.5
-    vad_threshold: float = 0.5
+    vad_threshold: float


 class ParsedChunk(BaseModel):
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -234,7 +234,7 @@ class WhisperStreamingTranscriber:
    ) -> Iterator[ParsedChunk]:
        logger.debug(f"{len(audio)}")

-        if ctx.vad:
+        if ctx.vad_threshold > 0.0:
            x = [
                v
                for v in self.vad(