2022-10-01 14:21:58 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
from typing import Iterator
|
|
|
|
|
2022-10-02 10:47:17 +00:00
|
|
|
import numpy as np
|
2022-10-01 14:21:58 +00:00
|
|
|
import torch
|
|
|
|
from whisper.audio import N_FRAMES, SAMPLE_RATE
|
|
|
|
|
|
|
|
from whispering.schema import SpeechSegment
|
|
|
|
|
|
|
|
|
|
|
|
class VAD:
    """Voice-activity detector that groups consecutive speech blocks.

    The ``silero_vad`` model is loaded through ``torch.hub`` and used to score
    audio one ``N_FRAMES``-long block at a time.
    """

    def __init__(
        self,
    ):
        # torch.hub.load yields a pair here; only the first element (the
        # model) is kept and the second is discarded.
        self.vad_model, _ = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
        )

    def __call__(
        self,
        *,
        audio: np.ndarray,
        thredhold: float = 0.5,
    ) -> Iterator[SpeechSegment]:
        """Yield one ``SpeechSegment`` per run of consecutive speech blocks.

        ``audio`` is cut along its first axis into blocks of ``N_FRAMES``
        elements. Every block is scored by the model (called with the block
        and ``SAMPLE_RATE``); a block counts as speech when its score is
        strictly greater than ``thredhold``. Adjacent speech blocks are merged
        into a single segment.

        Args:
            audio: Array to analyse. Its first-axis length should be a
                multiple of ``N_FRAMES``; any elements after the last whole
                block are never scored and never returned.
            thredhold: Score a block must exceed to be treated as speech.
                The misspelled name is kept because it is part of the
                keyword-only call interface.

        Yields:
            ``SpeechSegment`` objects whose ``start_block_idx`` is the first
            speech block of the run, whose ``end_block_idx`` is the index just
            past the last speech block (exclusive), and whose ``audio`` is the
            matching slice of the input.
        """
        # Integer floor division: number of whole blocks available.
        num_blocks: int = audio.shape[0] // N_FRAMES

        def make_segment(first_block: int, stop_block: int) -> SpeechSegment:
            # Blocks [first_block, stop_block) map onto a contiguous slice.
            return SpeechSegment(
                start_block_idx=first_block,
                end_block_idx=stop_block,
                audio=audio[N_FRAMES * first_block : N_FRAMES * stop_block],
            )

        # Index of the first block of the speech run currently open, if any.
        run_start = None
        for idx in range(num_blocks):
            block = audio[N_FRAMES * idx : N_FRAMES * (idx + 1)]
            # Inference only, so skip autograd bookkeeping. The context is
            # closed before any ``yield`` so the grad mode never leaks to the
            # consumer of this generator.
            with torch.no_grad():
                vad_prob = self.vad_model(
                    torch.from_numpy(block),
                    SAMPLE_RATE,
                ).item()

            if vad_prob > thredhold:
                if run_start is None:
                    run_start = idx
            elif run_start is not None:
                # A non-speech block closes the open run.
                yield make_segment(run_start, idx)
                run_start = None

        # A run that reaches the end of the audio is closed at the last block.
        if run_start is not None:
            yield make_segment(run_start, num_blocks)
|