Add --no-vad option

This commit is contained in:
Yuta Hayashibe 2022-10-02 20:38:21 +09:00
parent 08798f117a
commit 7f15cfeb39
4 changed files with 20 additions and 11 deletions

View file

@ -34,6 +34,7 @@ whispering --language en --model tiny
- ``--no-progress`` disables the progress message - ``--no-progress`` disables the progress message
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time - ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
- ``--debug`` outputs logs for debug - ``--debug`` outputs logs for debug
- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to also analyze periods of audio that contain no voice activity
### Parse interval ### Parse interval

View file

@ -155,6 +155,10 @@ def get_opts() -> argparse.Namespace:
"--no-progress", "--no-progress",
action="store_true", action="store_true",
) )
parser.add_argument(
"--no-vad",
action="store_true",
)
opts = parser.parse_args() opts = parser.parse_args()
if opts.beam_size <= 0: if opts.beam_size <= 0:
@ -187,6 +191,7 @@ def get_context(*, opts) -> Context:
beam_size=opts.beam_size, beam_size=opts.beam_size,
temperatures=opts.temperature, temperatures=opts.temperature,
allow_padding=opts.allow_padding, allow_padding=opts.allow_padding,
vad=not opts.no_vad,
) )
logger.debug(f"Context: {ctx}") logger.debug(f"Context: {ctx}")
return ctx return ctx

View file

@ -27,6 +27,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
timestamp: float = 0.0 timestamp: float = 0.0
buffer_tokens: List[torch.Tensor] = [] buffer_tokens: List[torch.Tensor] = []
buffer_mel: Optional[torch.Tensor] = None buffer_mel: Optional[torch.Tensor] = None
vad: bool = True
temperatures: List[float] temperatures: List[float]
allow_padding: bool = False allow_padding: bool = False

View file

@ -233,17 +233,19 @@ class WhisperStreamingTranscriber:
ctx: Context, ctx: Context,
) -> Iterator[ParsedChunk]: ) -> Iterator[ParsedChunk]:
logger.debug(f"{len(audio)}") logger.debug(f"{len(audio)}")
x = [
v if not ctx.vad:
for v in self.vad( x = [
audio=audio, v
total_block_number=1, for v in self.vad(
) audio=audio,
] total_block_number=1,
if len(x) == 0: # No speech )
logger.debug("No speech") ]
ctx.timestamp += len(audio) / N_FRAMES * self.duration_pre_one_mel if len(x) == 0: # No speech
return logger.debug("No speech")
ctx.timestamp += len(audio) / N_FRAMES * self.duration_pre_one_mel
return
new_mel = log_mel_spectrogram(audio=audio) new_mel = log_mel_spectrogram(audio=audio)
logger.debug(f"Incoming new_mel.shape: {new_mel.shape}") logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")