Yuta Hayashibe 2022-10-01 23:21:58 +09:00
parent bcbf1f7c8b
commit a62cb52f5f
5 changed files with 96 additions and 1 deletion

poetry.lock generated

@@ -378,6 +378,17 @@ python-versions = ">=3.7.0"
[package.dependencies]
typing-extensions = "*"
[[package]]
name = "torchaudio"
version = "0.12.1"
description = "An audio package for PyTorch"
category = "main"
optional = false
python-versions = "*"
[package.dependencies]
torch = "1.12.1"
[[package]]
name = "tqdm"
version = "4.64.1"
@@ -514,7 +525,7 @@ resolved_reference = "62fe7f1009a534986ac1d32a4aef8c244d029c28"
[metadata]
lock-version = "1.1"
python-versions = ">=3.8,<3.11"
content-hash = "d041d21a202339f405cc37076403f92135ee1f113cdfece5a78c9ee12374be7b"
content-hash = "75e53434d1d46d54a886ca7a896a2f0ba0072a1848f90d5b6dc46ea2c5b47191"
[metadata.files]
black = [
@@ -964,6 +975,27 @@ torch = [
{file = "torch-1.12.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:bfec2843daa654f04fda23ba823af03e7b6f7650a873cdb726752d0e3718dada"},
{file = "torch-1.12.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:69fe2cae7c39ccadd65a123793d30e0db881f1c1927945519c5c17323131437e"},
]
torchaudio = [
{file = "torchaudio-0.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dc138bee06b2305442fc132171f2a01d5f42509eaa21bdf87c3d26a6f4a09fdd"},
{file = "torchaudio-0.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1d81f71837d5d5be651e85ca9fa9377ecb4513b0129ddfb025540e1c2406d3e6"},
{file = "torchaudio-0.12.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:c2f46ad1332d4eb4c5bc2259bad22f7693d1e81cdcf2ab04242bf428d78f161f"},
{file = "torchaudio-0.12.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:21741a277d31f75215a09c1590170055b65c2eceda6aa5a263676745bd97172e"},
{file = "torchaudio-0.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:f0cc2d4ab4288d5115fab554a49bed6251469dc1548c961655556ec48a3c320e"},
{file = "torchaudio-0.12.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:23dbcf37af2f41d491c0337ca94501ec7ef588adb1766e1eb28033fac549bbd9"},
{file = "torchaudio-0.12.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e82c48b05d941d64cc67a18d13f8e76ba7e852fe9f187b47d3abfbebd1f05195"},
{file = "torchaudio-0.12.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:142da7f0f05517b32cb54ed6f37997f741ad1bd283474898b680b0dfed7ff926"},
{file = "torchaudio-0.12.1-cp37-cp37m-win_amd64.whl", hash = "sha256:1c839ceb2035c3ea3458e274e9a1afb65f5fa41678e76c3378b218eb23956579"},
{file = "torchaudio-0.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a4c8c15b1e810a93bb77b27fa49159bea2253b593ef94039946ec49aef51764f"},
{file = "torchaudio-0.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:83c08b71a6dc8e23c1d7b00780abb9e4c29528e47a6e644fe3dee7ac2263821e"},
{file = "torchaudio-0.12.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:186dcaa00b60e441f9c489c00966ecdd7412c2a4592058107f8c3a888cbbf337"},
{file = "torchaudio-0.12.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:2937756874050cb3249395d7814dacab2c296ce3e5ae3e63397aa4fc902db885"},
{file = "torchaudio-0.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:ba00c62bae021b8e5a3d38f04788e489e6f8d9eb16620d8c1e81b1e9d4bf1284"},
{file = "torchaudio-0.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:08f92bc53682d3bad8606dedb70a49e5a0f7cf9306c9173f074dbba97785442e"},
{file = "torchaudio-0.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2fc5a2bc8e8aad475bc519f3c82b9649e14b5c657487ffa712cf7c514143e9d7"},
{file = "torchaudio-0.12.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:075dba92c8c885ef1bc882e24a0ffdcce29a73f4d2377c75d1fa1c76702b37e3"},
{file = "torchaudio-0.12.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a2bc09eee50fb5adc3e40c66bb63d525344bb8359f65d9c600d53ea6212207e6"},
{file = "torchaudio-0.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:5b06c72da8ea8f8cd3075d7f97e2866b473aceaca08ef871895cd5fafde078bf"},
]
tqdm = [
{file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"},
{file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"},

pyproject.toml

@@ -13,6 +13,7 @@ sounddevice = "^0.4.5"
pydantic = "^1.10.2"
websockets = "^10.3"
tqdm = "*"
torchaudio = "^0.12.1"
[tool.poetry.group.dev.dependencies]
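Per the lock entries above, this constraint resolves to torchaudio 0.12.1, which in turn pins torch to exactly 1.12.1. A minimal sanity check that the active environment matches the locked pair (versions taken from this commit's poetry.lock; local builds may carry suffixes such as "+cu113"):

import torch
import torchaudio

# Versions locked above: torchaudio 0.12.1 requires torch = "1.12.1"
assert torch.__version__.startswith("1.12.1"), torch.__version__
assert torchaudio.__version__.startswith("0.12.1"), torchaudio.__version__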

whispering/schema.py

@@ -2,6 +2,7 @@
from typing import List, Optional
import numpy as np
import torch
from pydantic import BaseModel, root_validator
@@ -50,3 +51,9 @@ class ParsedChunk(BaseModel):
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float


class SpeechSegment(BaseModel, arbitrary_types_allowed=True):
    start_block_idx: int
    end_block_idx: int
    segment: np.ndarray
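The class keyword arbitrary_types_allowed=True is what lets pydantic accept the np.ndarray field, since pydantic has no built-in validator for NumPy arrays. A minimal construction sketch with hypothetical values (the zero array stands in for one N_FRAMES-sized block of real 16 kHz audio):

import numpy as np

seg = SpeechSegment(
    start_block_idx=0,
    end_block_idx=1,
    segment=np.zeros(3000, dtype=np.float32),  # N_FRAMES == 3000 in whisper.audio
)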

whispering/transcriber.py

@@ -18,6 +18,7 @@ from whisper.tokenizer import get_tokenizer
from whisper.utils import exact_div
from whispering.schema import Context, ParsedChunk, WhisperConfig
from whispering.vad import VAD
logger = getLogger(__name__)
@@ -51,6 +52,7 @@ class WhisperStreamingTranscriber:
        self.time_precision: Final[float] = (
            self.input_stride * HOP_LENGTH / SAMPLE_RATE
        )  # time per output token: 0.02 (seconds)
        self.vad = VAD()

    def _get_decoding_options(
        self,
@@ -233,6 +235,9 @@ class WhisperStreamingTranscriber:
        segment: np.ndarray,
        ctx: Context,
    ) -> Iterator[ParsedChunk]:
        # VAD.__call__ takes `segment` keyword-only and returns a lazy
        # generator, so pass it by keyword and materialize it before logging
        speech_segments = list(self.vad(segment=segment))
        logger.debug(f"{speech_segments}")
        new_mel = log_mel_spectrogram(audio=segment).unsqueeze(0)
        logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
        if ctx.buffer_mel is None:

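For reference, whisper's log_mel_spectrogram returns an (80, n_frames) tensor computed with a 160-sample hop, and the unsqueeze(0) above adds the batch dimension the encoder expects. A small shape check, assuming one second of 16 kHz input:

import numpy as np
from whisper.audio import log_mel_spectrogram

audio = np.zeros(16000, dtype=np.float32)  # 1 s of silence at 16 kHz
mel = log_mel_spectrogram(audio=audio).unsqueeze(0)
print(mel.shape)  # torch.Size([1, 80, 100]); 16000 samples / 160 hop = 100 frames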
whispering/vad.py Normal file

@@ -0,0 +1,50 @@
#!/usr/bin/env python3
from typing import Iterator

import numpy as np
import torch
from whisper.audio import N_FRAMES, SAMPLE_RATE

from whispering.schema import SpeechSegment


class VAD:
    def __init__(
        self,
    ):
        # Load the pretrained silero-vad model from torch.hub
        self.vad_model, _ = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
        )
    def __call__(
        self,
        *,
        segment: np.ndarray,
        threshold: float = 0.5,
    ) -> Iterator[SpeechSegment]:
        # len(segment) is assumed to be a multiple of N_FRAMES
        n_blocks: int = segment.shape[0] // N_FRAMES

        start_block_idx = None
        # The extra final iteration acts as a sentinel that flushes a
        # speech run reaching the end of the segment
        for idx in range(n_blocks + 1):
            is_speech = False
            if idx < n_blocks:
                start: int = N_FRAMES * idx
                end: int = N_FRAMES * (idx + 1)
                vad_prob = self.vad_model(
                    torch.from_numpy(segment[start:end]),
                    SAMPLE_RATE,
                ).item()
                is_speech = vad_prob > threshold
            if is_speech:
                if start_block_idx is None:
                    start_block_idx = idx
            else:
                if start_block_idx is not None:
                    yield SpeechSegment(
                        start_block_idx=start_block_idx,
                        end_block_idx=idx,
                        segment=segment[N_FRAMES * start_block_idx : N_FRAMES * idx],
                    )
                    start_block_idx = None
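A minimal driver for the new class, assuming the keyword-only signature above; instantiating VAD() fetches the silero-vad model via torch.hub on first use. All-zero input scores below the threshold, so this loop yields nothing; real speech would produce one SpeechSegment per contiguous run of speech blocks:

import numpy as np
from whisper.audio import N_FRAMES
from whispering.vad import VAD

vad = VAD()
audio = np.zeros(4 * N_FRAMES, dtype=np.float32)  # four blocks of silence
for seg in vad(segment=audio):
    print(seg.start_block_idx, seg.end_block_idx, seg.segment.shape)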