From 08798f117a88c64b6b58dbe47ba53014f91432eb Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe
Date: Sun, 2 Oct 2022 20:30:51 +0900
Subject: [PATCH] Add VAD

---
 whispering/transcriber.py | 16 ++++++++++++++--
 whispering/vad.py         | 15 +++++++++------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/whispering/transcriber.py b/whispering/transcriber.py
index 107d30a..a390b44 100644
--- a/whispering/transcriber.py
+++ b/whispering/transcriber.py
@@ -7,6 +7,7 @@ import numpy as np
 import torch
 from whisper import Whisper, load_model
 from whisper.audio import (
+    CHUNK_LENGTH,
     HOP_LENGTH,
     N_FRAMES,
     SAMPLE_RATE,
@@ -52,6 +53,7 @@ class WhisperStreamingTranscriber:
         self.time_precision: Final[float] = (
             self.input_stride * HOP_LENGTH / SAMPLE_RATE
         )  # time per output token: 0.02 (seconds)
+        self.duration_pre_one_mel: Final[float] = CHUNK_LENGTH / HOP_LENGTH
         self.vad = VAD()

     def _get_decoding_options(
@@ -230,8 +232,18 @@
         audio: np.ndarray,
         ctx: Context,
     ) -> Iterator[ParsedChunk]:
-        for speech_segment in self.vad(audio=audio):
-            logger.debug(f"{speech_segment}")
+        logger.debug(f"{len(audio)}")
+        x = [
+            v
+            for v in self.vad(
+                audio=audio,
+                total_block_number=1,
+            )
+        ]
+        if len(x) == 0:  # No speech
+            logger.debug("No speech")
+            ctx.timestamp += len(audio) / N_FRAMES * self.duration_pre_one_mel
+            return

         new_mel = log_mel_spectrogram(audio=audio)
         logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
diff --git a/whispering/vad.py b/whispering/vad.py
index f740b66..815e3bb 100644
--- a/whispering/vad.py
+++ b/whispering/vad.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-from typing import Iterator
+from typing import Iterator, Optional

 import numpy as np
 import torch
@@ -23,6 +23,7 @@ class VAD:
         *,
         audio: np.ndarray,
         thredhold: float = 0.5,
+        total_block_number: Optional[int] = None,
     ) -> Iterator[SpeechSegment]:
         # audio.shape should be multiple of (N_FRAMES,)

@@ -37,12 +38,14 @@
                 audio=audio[N_FRAMES * start_block_idx : N_FRAMES * idx],
             )

-        block_size: int = int(audio.shape[0] / N_FRAMES)
+        if total_block_number is None:
+            total_block_number = int(audio.shape[0] / N_FRAMES)
+        block_unit: int = audio.shape[0] // total_block_number

         start_block_idx = None
-        for idx in range(block_size):
-            start: int = N_FRAMES * idx
-            end: int = N_FRAMES * (idx + 1)
+        for idx in range(total_block_number):
+            start: int = block_unit * idx
+            end: int = block_unit * (idx + 1)
             vad_prob = self.vad_model(
                 torch.from_numpy(audio[start:end]),
                 SAMPLE_RATE,
@@ -60,5 +63,5 @@
         if start_block_idx is not None:
             yield my_ret(
                 start_block_idx=start_block_idx,
-                idx=block_size,
+                idx=total_block_number,
             )
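
A quick sanity check of the arithmetic this patch introduces, as a minimal standalone sketch (not part of the commit). The constant values are assumed from whisper.audio in upstream Whisper and hard-coded here so the snippet runs without dependencies.

#!/usr/bin/env python3
# Sanity check for the no-speech bookkeeping and the block partitioning above.
# Constants assumed from whisper.audio (hard-coded, not imported).
SAMPLE_RATE = 16000  # samples per second
HOP_LENGTH = 160     # samples per mel frame
CHUNK_LENGTH = 30    # seconds of audio per chunk
N_FRAMES = 3000      # mel frames per chunk == CHUNK_LENGTH * SAMPLE_RATE // HOP_LENGTH

# duration_pre_one_mel as defined in the transcriber.py hunk
duration_pre_one_mel = CHUNK_LENGTH / HOP_LENGTH  # 0.1875

# The no-speech branch advances ctx.timestamp by
#   len(audio) / N_FRAMES * duration_pre_one_mel
# which reduces to len(audio) / SAMPLE_RATE, i.e. the duration of the
# skipped buffer in seconds.
for n_samples in (480000, 240000, 160000):
    advanced = n_samples / N_FRAMES * duration_pre_one_mel
    assert abs(advanced - n_samples / SAMPLE_RATE) < 1e-9
    print(f"{n_samples} samples -> timestamp += {advanced:g} s")

# total_block_number=1 makes VAD.__call__ treat the whole buffer as a single
# block (block_unit == len(audio)); passing None keeps the old behavior of
# N_FRAMES-sample blocks.
n_samples = 480000
for total_block_number in (1, 3):
    block_unit = n_samples // total_block_number
    bounds = [(block_unit * i, block_unit * (i + 1)) for i in range(total_block_number)]
    print(f"total_block_number={total_block_number}: {bounds}")

With total_block_number=1 the transcriber makes one VAD decision per incoming buffer, so an all-silence buffer is skipped in a single pass (and its duration still added to ctx.timestamp) instead of being scanned block by block.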