Add --no-vad option

2024-06-02 21:39:28 +00:00 · 2022-10-02 20:38:21 +09:00 · 2022-10-02 20:38:21 +09:00 · 7f15cfeb39
parent 08798f117a
commit 7f15cfeb39
4 changed files with 20 additions and 11 deletions
--- a/README.md
+++ b/README.md
@ -34,6 +34,7 @@ whispering --language en --model tiny
 - ``--no-progress`` disables the progress message
 - ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
 - ``--debug`` outputs logs for debug
+- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to also analyze periods of sound that contain no voice activity

 ### Parse interval

--- a/whispering/cli.py
+++ b/whispering/cli.py
@ -155,6 +155,10 @@ def get_opts() -> argparse.Namespace:
        "--no-progress",
        action="store_true",
    )
+    parser.add_argument(
+        "--no-vad",
+        action="store_true",
+    )
    opts = parser.parse_args()

    if opts.beam_size <= 0:
@ -187,6 +191,7 @@ def get_context(*, opts) -> Context:
        beam_size=opts.beam_size,
        temperatures=opts.temperature,
        allow_padding=opts.allow_padding,
+        vad=not opts.no_vad,
    )
    logger.debug(f"Context: {ctx}")
    return ctx
--- a/whispering/schema.py
+++ b/whispering/schema.py
@ -27,6 +27,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
    timestamp: float = 0.0
    buffer_tokens: List[torch.Tensor] = []
    buffer_mel: Optional[torch.Tensor] = None
+    vad: bool = True

    temperatures: List[float]
    allow_padding: bool = False
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@ -233,6 +233,8 @@ class WhisperStreamingTranscriber:
        ctx: Context,
    ) -> Iterator[ParsedChunk]:
        logger.debug(f"{len(audio)}")
+
+        if ctx.vad:
            x = [
                v
                for v in self.vad(