From a62cb52f5fbc0cf701e1443d8abf415fc312b12c Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sat, 1 Oct 2022 23:21:58 +0900 Subject: [PATCH 1/5] Add --- poetry.lock | 34 +++++++++++++++++++++++++- pyproject.toml | 1 + whispering/schema.py | 7 ++++++ whispering/transcriber.py | 5 ++++ whispering/vad.py | 50 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 whispering/vad.py diff --git a/poetry.lock b/poetry.lock index 4f43d3a..00bc0e5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -378,6 +378,17 @@ python-versions = ">=3.7.0" [package.dependencies] typing-extensions = "*" +[[package]] +name = "torchaudio" +version = "0.12.1" +description = "An audio package for PyTorch" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +torch = "1.12.1" + [[package]] name = "tqdm" version = "4.64.1" @@ -514,7 +525,7 @@ resolved_reference = "62fe7f1009a534986ac1d32a4aef8c244d029c28" [metadata] lock-version = "1.1" python-versions = ">=3.8,<3.11" -content-hash = "d041d21a202339f405cc37076403f92135ee1f113cdfece5a78c9ee12374be7b" +content-hash = "75e53434d1d46d54a886ca7a896a2f0ba0072a1848f90d5b6dc46ea2c5b47191" [metadata.files] black = [ @@ -964,6 +975,27 @@ torch = [ {file = "torch-1.12.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:bfec2843daa654f04fda23ba823af03e7b6f7650a873cdb726752d0e3718dada"}, {file = "torch-1.12.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:69fe2cae7c39ccadd65a123793d30e0db881f1c1927945519c5c17323131437e"}, ] +torchaudio = [ + {file = "torchaudio-0.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dc138bee06b2305442fc132171f2a01d5f42509eaa21bdf87c3d26a6f4a09fdd"}, + {file = "torchaudio-0.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1d81f71837d5d5be651e85ca9fa9377ecb4513b0129ddfb025540e1c2406d3e6"}, + {file = "torchaudio-0.12.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:c2f46ad1332d4eb4c5bc2259bad22f7693d1e81cdcf2ab04242bf428d78f161f"}, + {file = "torchaudio-0.12.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:21741a277d31f75215a09c1590170055b65c2eceda6aa5a263676745bd97172e"}, + {file = "torchaudio-0.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:f0cc2d4ab4288d5115fab554a49bed6251469dc1548c961655556ec48a3c320e"}, + {file = "torchaudio-0.12.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:23dbcf37af2f41d491c0337ca94501ec7ef588adb1766e1eb28033fac549bbd9"}, + {file = "torchaudio-0.12.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e82c48b05d941d64cc67a18d13f8e76ba7e852fe9f187b47d3abfbebd1f05195"}, + {file = "torchaudio-0.12.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:142da7f0f05517b32cb54ed6f37997f741ad1bd283474898b680b0dfed7ff926"}, + {file = "torchaudio-0.12.1-cp37-cp37m-win_amd64.whl", hash = "sha256:1c839ceb2035c3ea3458e274e9a1afb65f5fa41678e76c3378b218eb23956579"}, + {file = "torchaudio-0.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a4c8c15b1e810a93bb77b27fa49159bea2253b593ef94039946ec49aef51764f"}, + {file = "torchaudio-0.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:83c08b71a6dc8e23c1d7b00780abb9e4c29528e47a6e644fe3dee7ac2263821e"}, + {file = "torchaudio-0.12.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:186dcaa00b60e441f9c489c00966ecdd7412c2a4592058107f8c3a888cbbf337"}, + {file = "torchaudio-0.12.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:2937756874050cb3249395d7814dacab2c296ce3e5ae3e63397aa4fc902db885"}, + {file = "torchaudio-0.12.1-cp38-cp38-win_amd64.whl", hash = 
"sha256:ba00c62bae021b8e5a3d38f04788e489e6f8d9eb16620d8c1e81b1e9d4bf1284"}, + {file = "torchaudio-0.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:08f92bc53682d3bad8606dedb70a49e5a0f7cf9306c9173f074dbba97785442e"}, + {file = "torchaudio-0.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2fc5a2bc8e8aad475bc519f3c82b9649e14b5c657487ffa712cf7c514143e9d7"}, + {file = "torchaudio-0.12.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:075dba92c8c885ef1bc882e24a0ffdcce29a73f4d2377c75d1fa1c76702b37e3"}, + {file = "torchaudio-0.12.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a2bc09eee50fb5adc3e40c66bb63d525344bb8359f65d9c600d53ea6212207e6"}, + {file = "torchaudio-0.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:5b06c72da8ea8f8cd3075d7f97e2866b473aceaca08ef871895cd5fafde078bf"}, +] tqdm = [ {file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"}, {file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"}, diff --git a/pyproject.toml b/pyproject.toml index 8dbf250..c508cc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ sounddevice = "^0.4.5" pydantic = "^1.10.2" websockets = "^10.3" tqdm = "*" +torchaudio = "^0.12.1" [tool.poetry.group.dev.dependencies] diff --git a/whispering/schema.py b/whispering/schema.py index 7ebecd4..6611347 100644 --- a/whispering/schema.py +++ b/whispering/schema.py @@ -2,6 +2,7 @@ from typing import List, Optional +import numpy as np import torch from pydantic import BaseModel, root_validator @@ -50,3 +51,9 @@ class ParsedChunk(BaseModel): avg_logprob: float compression_ratio: float no_speech_prob: float + + +class SpeechSegment(BaseModel, arbitrary_types_allowed=True): + start_block_idx: int + end_block_idx: int + segment: np.ndarray diff --git a/whispering/transcriber.py b/whispering/transcriber.py index 059d3c4..585842f 100644 --- a/whispering/transcriber.py +++ b/whispering/transcriber.py @@ -18,6 +18,7 @@ from whisper.tokenizer import get_tokenizer from whisper.utils import exact_div from whispering.schema import Context, ParsedChunk, WhisperConfig +from whispering.vad import VAD logger = getLogger(__name__) @@ -51,6 +52,7 @@ class WhisperStreamingTranscriber: self.time_precision: Final[float] = ( self.input_stride * HOP_LENGTH / SAMPLE_RATE ) # time per output token: 0.02 (seconds) + self.vad = VAD() def _get_decoding_options( self, @@ -233,6 +235,9 @@ class WhisperStreamingTranscriber: segment: np.ndarray, ctx: Context, ) -> Iterator[ParsedChunk]: + vad_probs = self.vad(segment) + logger.debug(f"{vad_probs}") + new_mel = log_mel_spectrogram(audio=segment).unsqueeze(0) logger.debug(f"Incoming new_mel.shape: {new_mel.shape}") if ctx.buffer_mel is None: diff --git a/whispering/vad.py b/whispering/vad.py new file mode 100644 index 0000000..c9218a9 --- /dev/null +++ b/whispering/vad.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +from typing import Iterator + +import numpy as np +import torch +from whisper.audio import N_FRAMES, SAMPLE_RATE + +from whispering.schema import SpeechSegment + + +class VAD: + def __init__( + self, + ): + self.vad_model, _ = torch.hub.load( + repo_or_dir="snakers4/silero-vad", + model="silero_vad", + ) + + def __call__( + self, + *, + segment: np.ndarray, + thredhold: float = 0.5, + ) -> Iterator[SpeechBlock]: + # segment.shape should be multiple of (N_FRAMES,) + + block_size: int = int(segment.shape[0] / N_FRAMES) + + start_block_idx = None + for idx in range(block_size + 1): + if 
idx < block_size: + start: int = N_FRAMES * idx + end: int = N_FRAMES * (idx + 1) + vad_prob = self.vad_model( + torch.from_numpy(segment[start:end]), + SAMPLE_RATE, + ).item() + if vad_prob > thredhold: + if start_block_idx is None: + start_block_idx = idx + else: + if start_block_idx is not None: + yield SpeechSegment( + start_block_idx=start_block_idx, + end_block_idx=idx, + segment=segment[N_FRAMES * start_block_idx : N_FRAMES * idx], + ) + start_block_idx = None From 936d5d0c45d27bce3cf7e3e3c0fbbeb781cd9d13 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sun, 2 Oct 2022 19:47:17 +0900 Subject: [PATCH 2/5] Fix --- whispering/cli.py | 8 ++++---- whispering/schema.py | 3 ++- whispering/transcriber.py | 9 +++++---- whispering/vad.py | 11 ++++++----- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/whispering/cli.py b/whispering/cli.py index 8b60271..fcbf9e4 100644 --- a/whispering/cli.py +++ b/whispering/cli.py @@ -48,16 +48,16 @@ def transcribe_from_mic( ): idx: int = 0 while True: - logger.debug(f"Segment #: {idx}, The rest of queue: {q.qsize()}") + logger.debug(f"Audio #: {idx}, The rest of queue: {q.qsize()}") if no_progress: - segment = q.get() + audio = q.get() else: pbar_thread = ProgressBar( num_block=num_block, # TODO: set more accurate value ) try: - segment = q.get() + audio = q.get() except KeyboardInterrupt: pbar_thread.kill() return @@ -68,7 +68,7 @@ def transcribe_from_mic( sys.stderr.write("Analyzing") sys.stderr.flush() - for chunk in wsp.transcribe(segment=segment, ctx=ctx): + for chunk in wsp.transcribe(audio=audio, ctx=ctx): if not no_progress: sys.stderr.write("\r") sys.stderr.flush() diff --git a/whispering/schema.py b/whispering/schema.py index 4d7f9af..d31d4df 100644 --- a/whispering/schema.py +++ b/whispering/schema.py @@ -2,6 +2,7 @@ from typing import List, Optional +import numpy as np import torch from pydantic import BaseModel, root_validator @@ -55,4 +56,4 @@ class ParsedChunk(BaseModel): class SpeechSegment(BaseModel, arbitrary_types_allowed=True): start_block_idx: int end_block_idx: int - segment: torch.Tensor + audio: np.ndarray diff --git a/whispering/transcriber.py b/whispering/transcriber.py index ac5d3b9..107d30a 100644 --- a/whispering/transcriber.py +++ b/whispering/transcriber.py @@ -3,6 +3,7 @@ from logging import getLogger from typing import Final, Iterator, Optional, Union +import numpy as np import torch from whisper import Whisper, load_model from whisper.audio import ( @@ -226,13 +227,13 @@ class WhisperStreamingTranscriber: def transcribe( self, *, - segment: torch.Tensor, + audio: np.ndarray, ctx: Context, ) -> Iterator[ParsedChunk]: - for speech_segment in self.vad(segment=segment): + for speech_segment in self.vad(audio=audio): logger.debug(f"{speech_segment}") - new_mel = log_mel_spectrogram(audio=segment) + new_mel = log_mel_spectrogram(audio=audio) logger.debug(f"Incoming new_mel.shape: {new_mel.shape}") if ctx.buffer_mel is None: mel = new_mel @@ -244,7 +245,7 @@ class WhisperStreamingTranscriber: seek: int = 0 while seek < mel.shape[-1]: - segment = ( + segment: torch.Tensor = ( pad_or_trim(mel[:, seek:], N_FRAMES) .to(self.model.device) # type: ignore .to(self.dtype) diff --git a/whispering/vad.py b/whispering/vad.py index 8d992de..f740b66 100644 --- a/whispering/vad.py +++ b/whispering/vad.py @@ -2,6 +2,7 @@ from typing import Iterator +import numpy as np import torch from whisper.audio import N_FRAMES, SAMPLE_RATE @@ -20,10 +21,10 @@ class VAD: def __call__( self, *, - segment: torch.Tensor, + audio: np.ndarray, 
         threshold: float = 0.5,
     ) -> Iterator[SpeechSegment]:
-        # segment.shape should be multiple of (N_FRAMES,)
+        # audio.shape should be multiple of (N_FRAMES,)
 
         def my_ret(
             *,
@@ -33,17 +34,17 @@ class VAD:
             return SpeechSegment(
                 start_block_idx=start_block_idx,
                 end_block_idx=idx,
-                segment=segment[N_FRAMES * start_block_idx : N_FRAMES * idx],
+                audio=audio[N_FRAMES * start_block_idx : N_FRAMES * idx],
             )
 
-        block_size: int = int(segment.shape[0] / N_FRAMES)
+        block_size: int = int(audio.shape[0] / N_FRAMES)
 
         start_block_idx = None
         for idx in range(block_size):
             start: int = N_FRAMES * idx
             end: int = N_FRAMES * (idx + 1)
             vad_prob = self.vad_model(
-                torch.from_numpy(segment[start:end]),
+                torch.from_numpy(audio[start:end]),
                 SAMPLE_RATE,
             ).item()
             if vad_prob > threshold:
From 847eee58197d754c27e5798236b39838be723441 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe
Date: Sun, 2 Oct 2022 19:48:41 +0900
Subject: [PATCH 3/5] Fix

---
 whispering/serve.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/whispering/serve.py b/whispering/serve.py
index 20d50fb..c0d2e16 100644
--- a/whispering/serve.py
+++ b/whispering/serve.py
@@ -21,7 +21,7 @@ async def serve_with_websocket_main(websocket):
     )
 
     while True:
-        logger.debug(f"Segment #: {idx}")
+        logger.debug(f"Audio #: {idx}")
         try:
             message = await websocket.recv()
         except ConnectionClosedOK:
@@ -32,9 +32,9 @@ async def serve_with_websocket_main(websocket):
             continue
 
         logger.debug(f"Message size: {len(message)}")
-        segment = np.frombuffer(message, dtype=np.float32)
+        audio = np.frombuffer(message, dtype=np.float32)
         for chunk in g_wsp.transcribe(
-            segment=segment,  # type: ignore
+            audio=audio,  # type: ignore
             ctx=ctx,
         ):
             await websocket.send(chunk.json())
From 08798f117a88c64b6b58dbe47ba53014f91432eb Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe
Date: Sun, 2 Oct 2022 20:30:51 +0900
Subject: [PATCH 4/5] Add VAD

---
 whispering/transcriber.py | 16 ++++++++++++++--
 whispering/vad.py         | 15 +++++++++------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/whispering/transcriber.py b/whispering/transcriber.py
index 107d30a..a390b44 100644
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -7,6 +7,7 @@ import numpy as np
 import torch
 from whisper import Whisper, load_model
 from whisper.audio import (
+    CHUNK_LENGTH,
     HOP_LENGTH,
     N_FRAMES,
     SAMPLE_RATE,
@@ -52,6 +53,7 @@ class WhisperStreamingTranscriber:
         self.time_precision: Final[float] = (
             self.input_stride * HOP_LENGTH / SAMPLE_RATE
         )  # time per output token: 0.02 (seconds)
+        self.duration_per_one_mel: Final[float] = CHUNK_LENGTH / HOP_LENGTH
         self.vad = VAD()
 
     def _get_decoding_options(
@@ -230,8 +232,18 @@ class WhisperStreamingTranscriber:
         audio: np.ndarray,
         ctx: Context,
     ) -> Iterator[ParsedChunk]:
-        for speech_segment in self.vad(audio=audio):
-            logger.debug(f"{speech_segment}")
+        logger.debug(f"{len(audio)}")
+        x = [
+            v
+            for v in self.vad(
+                audio=audio,
+                total_block_number=1,
+            )
+        ]
+        if len(x) == 0:  # No speech
+            logger.debug("No speech")
+            ctx.timestamp += len(audio) / N_FRAMES * self.duration_per_one_mel
+            return
 
         new_mel = log_mel_spectrogram(audio=audio)
         logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
diff --git a/whispering/vad.py b/whispering/vad.py
index f740b66..815e3bb 100644
--- a/whispering/vad.py
+++ b/whispering/vad.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from typing import Iterator
+from typing import Iterator, Optional
 
 import numpy as np
 import torch
 from whisper.audio import N_FRAMES, SAMPLE_RATE
 
 from whispering.schema import SpeechSegment
 
@@ -23,6 +23,7 @@ class VAD:
         *,
         audio: np.ndarray,
         threshold: float = 0.5,
+        total_block_number: Optional[int] = None,
     ) -> Iterator[SpeechSegment]:
         # audio.shape should be multiple of (N_FRAMES,)
 
@@ -37,12 +38,14 @@ class VAD:
                 audio=audio[N_FRAMES * start_block_idx : N_FRAMES * idx],
             )
 
-        block_size: int = int(audio.shape[0] / N_FRAMES)
+        if total_block_number is None:
+            total_block_number = int(audio.shape[0] / N_FRAMES)
+        block_unit: int = audio.shape[0] // total_block_number
 
         start_block_idx = None
-        for idx in range(block_size):
-            start: int = N_FRAMES * idx
-            end: int = N_FRAMES * (idx + 1)
+        for idx in range(total_block_number):
+            start: int = block_unit * idx
+            end: int = block_unit * (idx + 1)
             vad_prob = self.vad_model(
                 torch.from_numpy(audio[start:end]),
                 SAMPLE_RATE,
             ).item()
@@ -60,5 +63,5 @@ class VAD:
         if start_block_idx is not None:
             yield my_ret(
                 start_block_idx=start_block_idx,
-                idx=block_size,
+                idx=total_block_number,
             )
From 7f15cfeb394019fde615dafc510e0149c475f0e5 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe
Date: Sun, 2 Oct 2022 20:38:21 +0900
Subject: [PATCH 5/5] Add --no-vad option

---
 README.md                 |  1 +
 whispering/cli.py         |  5 +++++
 whispering/schema.py      |  1 +
 whispering/transcriber.py | 24 +++++++++++++-----------
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 042e410..cd646a5 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ whispering --language en --model tiny
 - ``--no-progress`` disables the progress message
 - ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
 - ``--debug`` outputs logs for debug
+- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze audio even during periods without detected voice activity
 
 ### Parse interval
 
diff --git a/whispering/cli.py b/whispering/cli.py
index fcbf9e4..6e9eca4 100644
--- a/whispering/cli.py
+++ b/whispering/cli.py
@@ -155,6 +155,10 @@ def get_opts() -> argparse.Namespace:
         "--no-progress",
         action="store_true",
     )
+    parser.add_argument(
+        "--no-vad",
+        action="store_true",
+    )
 
     opts = parser.parse_args()
     if opts.beam_size <= 0:
@@ -187,6 +191,7 @@ def get_context(*, opts) -> Context:
         beam_size=opts.beam_size,
         temperatures=opts.temperature,
         allow_padding=opts.allow_padding,
+        vad=not opts.no_vad,
    )
     logger.debug(f"Context: {ctx}")
     return ctx
diff --git a/whispering/schema.py b/whispering/schema.py
index d31d4df..14ef78a 100644
--- a/whispering/schema.py
+++ b/whispering/schema.py
@@ -27,6 +27,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     timestamp: float = 0.0
     buffer_tokens: List[torch.Tensor] = []
     buffer_mel: Optional[torch.Tensor] = None
+    vad: bool = True
 
     temperatures: List[float]
     allow_padding: bool = False
diff --git a/whispering/transcriber.py b/whispering/transcriber.py
index a390b44..be3039c 100644
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -233,17 +233,19 @@ class WhisperStreamingTranscriber:
         ctx: Context,
     ) -> Iterator[ParsedChunk]:
         logger.debug(f"{len(audio)}")
-        x = [
-            v
-            for v in self.vad(
-                audio=audio,
-                total_block_number=1,
-            )
-        ]
-        if len(x) == 0:  # No speech
-            logger.debug("No speech")
-            ctx.timestamp += len(audio) / N_FRAMES * self.duration_per_one_mel
-            return
+
+        if ctx.vad:
+            x = [
+                v
+                for v in self.vad(
+                    audio=audio,
+                    total_block_number=1,
+                )
+            ]
+            if len(x) == 0:  # No speech
+                logger.debug("No speech")
+                ctx.timestamp += len(audio) / N_FRAMES * self.duration_per_one_mel
+                return
 
         new_mel = log_mel_spectrogram(audio=audio)
         logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
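
Usage sketch (not part of the patch series): with all five patches applied, the VAD added above can be exercised on its own. This is a minimal smoke test under stated assumptions: a 16 kHz float32 mono buffer like the one the CLI produces, and the 2022-era snakers4/silero-vad model, which accepts arbitrary chunk sizes. The zero-filled buffer is a made-up placeholder; pure silence simply yields no segments.

    import numpy as np
    from whisper.audio import N_FRAMES
    from whispering.vad import VAD

    vad = VAD()  # the constructor downloads silero-vad via torch.hub on first use
    # vad.py reuses N_FRAMES (3000) as a per-block *sample* count, i.e. blocks
    # of 3000 / 16000 = 0.1875 s of audio, not mel frames
    audio = np.zeros(10 * N_FRAMES, dtype=np.float32)  # placeholder silence
    for seg in vad(audio=audio):
        print(seg.start_block_idx, seg.end_block_idx, seg.audio.shape)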
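
The ``--no-vad`` flag from PATCH 5/5 composes with the invocation already documented in the README, for example:

    whispering --language en --model tiny --no-vad

With VAD enabled (the default), chunks with no detected speech are skipped and only the timestamp is advanced; ``--no-vad`` forces every chunk through Whisper.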