From a62cb52f5fbc0cf701e1443d8abf415fc312b12c Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sat, 1 Oct 2022 23:21:58 +0900 Subject: [PATCH 1/5] Add --- poetry.lock | 34 +++++++++++++++++++++++++- pyproject.toml | 1 + whispering/schema.py | 7 ++++++ whispering/transcriber.py | 5 ++++ whispering/vad.py | 50 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 whispering/vad.py diff --git a/poetry.lock b/poetry.lock index 4f43d3a..00bc0e5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -378,6 +378,17 @@ python-versions = ">=3.7.0" [package.dependencies] typing-extensions = "*" +[[package]] +name = "torchaudio" +version = "0.12.1" +description = "An audio package for PyTorch" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +torch = "1.12.1" + [[package]] name = "tqdm" version = "4.64.1" @@ -514,7 +525,7 @@ resolved_reference = "62fe7f1009a534986ac1d32a4aef8c244d029c28" [metadata] lock-version = "1.1" python-versions = ">=3.8,<3.11" -content-hash = "d041d21a202339f405cc37076403f92135ee1f113cdfece5a78c9ee12374be7b" +content-hash = "75e53434d1d46d54a886ca7a896a2f0ba0072a1848f90d5b6dc46ea2c5b47191" [metadata.files] black = [ @@ -964,6 +975,27 @@ torch = [ {file = "torch-1.12.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:bfec2843daa654f04fda23ba823af03e7b6f7650a873cdb726752d0e3718dada"}, {file = "torch-1.12.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:69fe2cae7c39ccadd65a123793d30e0db881f1c1927945519c5c17323131437e"}, ] +torchaudio = [ + {file = "torchaudio-0.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dc138bee06b2305442fc132171f2a01d5f42509eaa21bdf87c3d26a6f4a09fdd"}, + {file = "torchaudio-0.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1d81f71837d5d5be651e85ca9fa9377ecb4513b0129ddfb025540e1c2406d3e6"}, + {file = "torchaudio-0.12.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:c2f46ad1332d4eb4c5bc2259bad22f7693d1e81cdcf2ab04242bf428d78f161f"}, + {file = "torchaudio-0.12.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:21741a277d31f75215a09c1590170055b65c2eceda6aa5a263676745bd97172e"}, + {file = "torchaudio-0.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:f0cc2d4ab4288d5115fab554a49bed6251469dc1548c961655556ec48a3c320e"}, + {file = "torchaudio-0.12.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:23dbcf37af2f41d491c0337ca94501ec7ef588adb1766e1eb28033fac549bbd9"}, + {file = "torchaudio-0.12.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e82c48b05d941d64cc67a18d13f8e76ba7e852fe9f187b47d3abfbebd1f05195"}, + {file = "torchaudio-0.12.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:142da7f0f05517b32cb54ed6f37997f741ad1bd283474898b680b0dfed7ff926"}, + {file = "torchaudio-0.12.1-cp37-cp37m-win_amd64.whl", hash = "sha256:1c839ceb2035c3ea3458e274e9a1afb65f5fa41678e76c3378b218eb23956579"}, + {file = "torchaudio-0.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a4c8c15b1e810a93bb77b27fa49159bea2253b593ef94039946ec49aef51764f"}, + {file = "torchaudio-0.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:83c08b71a6dc8e23c1d7b00780abb9e4c29528e47a6e644fe3dee7ac2263821e"}, + {file = "torchaudio-0.12.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:186dcaa00b60e441f9c489c00966ecdd7412c2a4592058107f8c3a888cbbf337"}, + {file = "torchaudio-0.12.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:2937756874050cb3249395d7814dacab2c296ce3e5ae3e63397aa4fc902db885"}, + {file = "torchaudio-0.12.1-cp38-cp38-win_amd64.whl", hash = 
"sha256:ba00c62bae021b8e5a3d38f04788e489e6f8d9eb16620d8c1e81b1e9d4bf1284"}, + {file = "torchaudio-0.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:08f92bc53682d3bad8606dedb70a49e5a0f7cf9306c9173f074dbba97785442e"}, + {file = "torchaudio-0.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2fc5a2bc8e8aad475bc519f3c82b9649e14b5c657487ffa712cf7c514143e9d7"}, + {file = "torchaudio-0.12.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:075dba92c8c885ef1bc882e24a0ffdcce29a73f4d2377c75d1fa1c76702b37e3"}, + {file = "torchaudio-0.12.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a2bc09eee50fb5adc3e40c66bb63d525344bb8359f65d9c600d53ea6212207e6"}, + {file = "torchaudio-0.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:5b06c72da8ea8f8cd3075d7f97e2866b473aceaca08ef871895cd5fafde078bf"}, +] tqdm = [ {file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"}, {file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"}, diff --git a/pyproject.toml b/pyproject.toml index 8dbf250..c508cc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ sounddevice = "^0.4.5" pydantic = "^1.10.2" websockets = "^10.3" tqdm = "*" +torchaudio = "^0.12.1" [tool.poetry.group.dev.dependencies] diff --git a/whispering/schema.py b/whispering/schema.py index 7ebecd4..6611347 100644 --- a/whispering/schema.py +++ b/whispering/schema.py @@ -2,6 +2,7 @@ from typing import List, Optional +import numpy as np import torch from pydantic import BaseModel, root_validator @@ -50,3 +51,9 @@ class ParsedChunk(BaseModel): avg_logprob: float compression_ratio: float no_speech_prob: float + + +class SpeechSegment(BaseModel, arbitrary_types_allowed=True): + start_block_idx: int + end_block_idx: int + segment: np.ndarray diff --git a/whispering/transcriber.py b/whispering/transcriber.py index 059d3c4..585842f 100644 --- a/whispering/transcriber.py +++ b/whispering/transcriber.py @@ -18,6 +18,7 @@ from whisper.tokenizer import get_tokenizer from whisper.utils import exact_div from whispering.schema import Context, ParsedChunk, WhisperConfig +from whispering.vad import VAD logger = getLogger(__name__) @@ -51,6 +52,7 @@ class WhisperStreamingTranscriber: self.time_precision: Final[float] = ( self.input_stride * HOP_LENGTH / SAMPLE_RATE ) # time per output token: 0.02 (seconds) + self.vad = VAD() def _get_decoding_options( self, @@ -233,6 +235,9 @@ class WhisperStreamingTranscriber: segment: np.ndarray, ctx: Context, ) -> Iterator[ParsedChunk]: + vad_probs = self.vad(segment) + logger.debug(f"{vad_probs}") + new_mel = log_mel_spectrogram(audio=segment).unsqueeze(0) logger.debug(f"Incoming new_mel.shape: {new_mel.shape}") if ctx.buffer_mel is None: diff --git a/whispering/vad.py b/whispering/vad.py new file mode 100644 index 0000000..c9218a9 --- /dev/null +++ b/whispering/vad.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +from typing import Iterator + +import numpy as np +import torch +from whisper.audio import N_FRAMES, SAMPLE_RATE + +from whispering.schema import SpeechSegment + + +class VAD: + def __init__( + self, + ): + self.vad_model, _ = torch.hub.load( + repo_or_dir="snakers4/silero-vad", + model="silero_vad", + ) + + def __call__( + self, + *, + segment: np.ndarray, + thredhold: float = 0.5, + ) -> Iterator[SpeechBlock]: + # segment.shape should be multiple of (N_FRAMES,) + + block_size: int = int(segment.shape[0] / N_FRAMES) + + start_block_idx = None + for idx in range(block_size + 1): + if 
idx < block_size: + start: int = N_FRAMES * idx + end: int = N_FRAMES * (idx + 1) + vad_prob = self.vad_model( + torch.from_numpy(segment[start:end]), + SAMPLE_RATE, + ).item() + if vad_prob > thredhold: + if start_block_idx is None: + start_block_idx = idx + else: + if start_block_idx is not None: + yield SpeechSegment( + start_block_idx=start_block_idx, + end_block_idx=idx, + segment=segment[N_FRAMES * start_block_idx : N_FRAMES * idx], + ) + start_block_idx = None From 936d5d0c45d27bce3cf7e3e3c0fbbeb781cd9d13 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sun, 2 Oct 2022 19:47:17 +0900 Subject: [PATCH 2/5] Fix --- whispering/cli.py | 8 ++++---- whispering/schema.py | 3 ++- whispering/transcriber.py | 9 +++++---- whispering/vad.py | 11 ++++++----- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/whispering/cli.py b/whispering/cli.py index 8b60271..fcbf9e4 100644 --- a/whispering/cli.py +++ b/whispering/cli.py @@ -48,16 +48,16 @@ def transcribe_from_mic( ): idx: int = 0 while True: - logger.debug(f"Segment #: {idx}, The rest of queue: {q.qsize()}") + logger.debug(f"Audio #: {idx}, The rest of queue: {q.qsize()}") if no_progress: - segment = q.get() + audio = q.get() else: pbar_thread = ProgressBar( num_block=num_block, # TODO: set more accurate value ) try: - segment = q.get() + audio = q.get() except KeyboardInterrupt: pbar_thread.kill() return @@ -68,7 +68,7 @@ def transcribe_from_mic( sys.stderr.write("Analyzing") sys.stderr.flush() - for chunk in wsp.transcribe(segment=segment, ctx=ctx): + for chunk in wsp.transcribe(audio=audio, ctx=ctx): if not no_progress: sys.stderr.write("\r") sys.stderr.flush() diff --git a/whispering/schema.py b/whispering/schema.py index 4d7f9af..d31d4df 100644 --- a/whispering/schema.py +++ b/whispering/schema.py @@ -2,6 +2,7 @@ from typing import List, Optional +import numpy as np import torch from pydantic import BaseModel, root_validator @@ -55,4 +56,4 @@ class ParsedChunk(BaseModel): class SpeechSegment(BaseModel, arbitrary_types_allowed=True): start_block_idx: int end_block_idx: int - segment: torch.Tensor + audio: np.ndarray diff --git a/whispering/transcriber.py b/whispering/transcriber.py index ac5d3b9..107d30a 100644 --- a/whispering/transcriber.py +++ b/whispering/transcriber.py @@ -3,6 +3,7 @@ from logging import getLogger from typing import Final, Iterator, Optional, Union +import numpy as np import torch from whisper import Whisper, load_model from whisper.audio import ( @@ -226,13 +227,13 @@ class WhisperStreamingTranscriber: def transcribe( self, *, - segment: torch.Tensor, + audio: np.ndarray, ctx: Context, ) -> Iterator[ParsedChunk]: - for speech_segment in self.vad(segment=segment): + for speech_segment in self.vad(audio=audio): logger.debug(f"{speech_segment}") - new_mel = log_mel_spectrogram(audio=segment) + new_mel = log_mel_spectrogram(audio=audio) logger.debug(f"Incoming new_mel.shape: {new_mel.shape}") if ctx.buffer_mel is None: mel = new_mel @@ -244,7 +245,7 @@ class WhisperStreamingTranscriber: seek: int = 0 while seek < mel.shape[-1]: - segment = ( + segment: torch.Tensor = ( pad_or_trim(mel[:, seek:], N_FRAMES) .to(self.model.device) # type: ignore .to(self.dtype) diff --git a/whispering/vad.py b/whispering/vad.py index 8d992de..f740b66 100644 --- a/whispering/vad.py +++ b/whispering/vad.py @@ -2,6 +2,7 @@ from typing import Iterator +import numpy as np import torch from whisper.audio import N_FRAMES, SAMPLE_RATE @@ -20,10 +21,10 @@ class VAD: def __call__( self, *, - segment: torch.Tensor, + audio: np.ndarray, 
         threshold: float = 0.5,
     ) -> Iterator[SpeechSegment]:
-        # segment.shape should be multiple of (N_FRAMES,)
+        # audio.shape should be multiple of (N_FRAMES,)
 
         def my_ret(
             *,
@@ -33,17 +34,17 @@ class VAD:
             return SpeechSegment(
                 start_block_idx=start_block_idx,
                 end_block_idx=idx,
-                segment=segment[N_FRAMES * start_block_idx : N_FRAMES * idx],
+                audio=audio[N_FRAMES * start_block_idx : N_FRAMES * idx],
             )
 
-        block_size: int = int(segment.shape[0] / N_FRAMES)
+        block_size: int = int(audio.shape[0] / N_FRAMES)
 
         start_block_idx = None
         for idx in range(block_size):
             start: int = N_FRAMES * idx
             end: int = N_FRAMES * (idx + 1)
             vad_prob = self.vad_model(
-                torch.from_numpy(segment[start:end]),
+                torch.from_numpy(audio[start:end]),
                 SAMPLE_RATE,
             ).item()
             if vad_prob > threshold:
From 847eee58197d754c27e5798236b39838be723441 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe
Date: Sun, 2 Oct 2022 19:48:41 +0900
Subject: [PATCH 3/5] Fix

---
 whispering/serve.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/whispering/serve.py b/whispering/serve.py
index 20d50fb..c0d2e16 100644
--- a/whispering/serve.py
+++ b/whispering/serve.py
@@ -21,7 +21,7 @@ async def serve_with_websocket_main(websocket):
     )
 
     while True:
-        logger.debug(f"Segment #: {idx}")
+        logger.debug(f"Audio #: {idx}")
         try:
             message = await websocket.recv()
         except ConnectionClosedOK:
@@ -32,9 +32,9 @@ async def serve_with_websocket_main(websocket):
             continue
 
         logger.debug(f"Message size: {len(message)}")
-        segment = np.frombuffer(message, dtype=np.float32)
+        audio = np.frombuffer(message, dtype=np.float32)
         for chunk in g_wsp.transcribe(
-            segment=segment,  # type: ignore
+            audio=audio,  # type: ignore
             ctx=ctx,
         ):
             await websocket.send(chunk.json())
From 08798f117a88c64b6b58dbe47ba53014f91432eb Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe
Date: Sun, 2 Oct 2022 20:30:51 +0900
Subject: [PATCH 4/5] Add VAD

---
 whispering/transcriber.py | 16 ++++++++++++++--
 whispering/vad.py         | 15 +++++++++------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/whispering/transcriber.py b/whispering/transcriber.py
index 107d30a..a390b44 100644
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -7,6 +7,7 @@ import numpy as np
 import torch
 from whisper import Whisper, load_model
 from whisper.audio import (
+    CHUNK_LENGTH,
     HOP_LENGTH,
     N_FRAMES,
     SAMPLE_RATE,
@@ -52,6 +53,7 @@ class WhisperStreamingTranscriber:
         self.time_precision: Final[float] = (
             self.input_stride * HOP_LENGTH / SAMPLE_RATE
         )  # time per output token: 0.02 (seconds)
+        self.duration_per_one_mel: Final[float] = CHUNK_LENGTH / HOP_LENGTH
         self.vad = VAD()
 
     def _get_decoding_options(
@@ -230,8 +232,18 @@ class WhisperStreamingTranscriber:
         audio: np.ndarray,
         ctx: Context,
     ) -> Iterator[ParsedChunk]:
-        for speech_segment in self.vad(audio=audio):
-            logger.debug(f"{speech_segment}")
+        logger.debug(f"{len(audio)}")
+        x = [
+            v
+            for v in self.vad(
+                audio=audio,
+                total_block_number=1,
+            )
+        ]
+        if len(x) == 0:  # No speech
+            logger.debug("No speech")
+            ctx.timestamp += len(audio) / N_FRAMES * self.duration_per_one_mel
+            return
 
         new_mel = log_mel_spectrogram(audio=audio)
         logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
diff --git a/whispering/vad.py b/whispering/vad.py
index f740b66..815e3bb 100644
--- a/whispering/vad.py
+++ b/whispering/vad.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from typing import Iterator
+from typing import Iterator, Optional
 
 import numpy as np
 import torch
 from whisper.audio import N_FRAMES, SAMPLE_RATE
 
 from whispering.schema import SpeechSegment
 
@@ -23,6 +23,7 @@ class VAD:
         *,
         audio: np.ndarray,
         threshold: float = 0.5,
+        total_block_number: Optional[int] = None,
     ) -> Iterator[SpeechSegment]:
         # audio.shape should be multiple of (N_FRAMES,)
 
@@ -37,12 +38,14 @@ class VAD:
                 audio=audio[N_FRAMES * start_block_idx : N_FRAMES * idx],
             )
 
-        block_size: int = int(audio.shape[0] / N_FRAMES)
+        if total_block_number is None:
+            total_block_number = int(audio.shape[0] / N_FRAMES)
+        block_unit: int = audio.shape[0] // total_block_number
 
         start_block_idx = None
-        for idx in range(block_size):
-            start: int = N_FRAMES * idx
-            end: int = N_FRAMES * (idx + 1)
+        for idx in range(total_block_number):
+            start: int = block_unit * idx
+            end: int = block_unit * (idx + 1)
             vad_prob = self.vad_model(
                 torch.from_numpy(audio[start:end]),
                 SAMPLE_RATE,
             ).item()
@@ -60,5 +63,5 @@ class VAD:
         if start_block_idx is not None:
             yield my_ret(
                 start_block_idx=start_block_idx,
-                idx=block_size,
+                idx=total_block_number,
             )
From 7f15cfeb394019fde615dafc510e0149c475f0e5 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe
Date: Sun, 2 Oct 2022 20:38:21 +0900
Subject: [PATCH 5/5] Add --no-vad option

---
 README.md                 |  1 +
 whispering/cli.py         |  5 +++++
 whispering/schema.py      |  1 +
 whispering/transcriber.py | 24 +++++++++++++-----------
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 042e410..cd646a5 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ whispering --language en --model tiny
 - ``--no-progress`` disables the progress message
 - ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
 - ``--debug`` outputs logs for debug
+- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze audio even during periods without detected voice activity
 
 ### Parse interval
 
diff --git a/whispering/cli.py b/whispering/cli.py
index fcbf9e4..6e9eca4 100644
--- a/whispering/cli.py
+++ b/whispering/cli.py
@@ -155,6 +155,10 @@ def get_opts() -> argparse.Namespace:
         "--no-progress",
         action="store_true",
     )
+    parser.add_argument(
+        "--no-vad",
+        action="store_true",
+    )
 
     opts = parser.parse_args()
     if opts.beam_size <= 0:
@@ -187,6 +191,7 @@ def get_context(*, opts) -> Context:
         beam_size=opts.beam_size,
         temperatures=opts.temperature,
         allow_padding=opts.allow_padding,
+        vad=not opts.no_vad,
    )
     logger.debug(f"Context: {ctx}")
     return ctx
diff --git a/whispering/schema.py b/whispering/schema.py
index d31d4df..14ef78a 100644
--- a/whispering/schema.py
+++ b/whispering/schema.py
@@ -27,6 +27,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     timestamp: float = 0.0
     buffer_tokens: List[torch.Tensor] = []
     buffer_mel: Optional[torch.Tensor] = None
+    vad: bool = True
 
     temperatures: List[float]
     allow_padding: bool = False
diff --git a/whispering/transcriber.py b/whispering/transcriber.py
index a390b44..be3039c 100644
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -233,17 +233,19 @@ class WhisperStreamingTranscriber:
         ctx: Context,
     ) -> Iterator[ParsedChunk]:
         logger.debug(f"{len(audio)}")
-        x = [
-            v
-            for v in self.vad(
-                audio=audio,
-                total_block_number=1,
-            )
-        ]
-        if len(x) == 0:  # No speech
-            logger.debug("No speech")
-            ctx.timestamp += len(audio) / N_FRAMES * self.duration_per_one_mel
-            return
+
+        if ctx.vad:
+            x = [
+                v
+                for v in self.vad(
+                    audio=audio,
+                    total_block_number=1,
+                )
+            ]
+            if len(x) == 0:  # No speech
+                logger.debug("No speech")
+                ctx.timestamp += len(audio) / N_FRAMES * self.duration_per_one_mel
+                return
 
         new_mel = log_mel_spectrogram(audio=audio)
         logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
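
Usage sketch (not part of the patch series): with all five patches applied, the VAD added above can be exercised on its own. This is a minimal smoke test under stated assumptions: a 16 kHz float32 mono buffer like the one the CLI produces, and the 2022-era snakers4/silero-vad model, which accepts arbitrary chunk sizes. The zero-filled buffer is a made-up placeholder; pure silence simply yields no segments.

    import numpy as np
    from whisper.audio import N_FRAMES
    from whispering.vad import VAD

    vad = VAD()  # the constructor downloads silero-vad via torch.hub on first use
    # vad.py reuses N_FRAMES (3000) as a per-block *sample* count, i.e. blocks
    # of 3000 / 16000 = 0.1875 s of audio, not mel frames
    audio = np.zeros(10 * N_FRAMES, dtype=np.float32)  # placeholder silence
    for seg in vad(audio=audio):
        print(seg.start_block_idx, seg.end_block_idx, seg.audio.shape)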
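
The ``--no-vad`` flag from PATCH 5/5 composes with the invocation already documented in the README, for example:

    whispering --language en --model tiny --no-vad

With VAD enabled (the default), chunks with no detected speech are skipped and only the timestamp is advanced; ``--no-vad`` forces every chunk through Whisper.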