From 86f38c6ca91a2bc9ff54836906f734ea7ceae502 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sat, 15 Oct 2022 13:33:39 +0900 Subject: [PATCH] Removed --no-vad option and added --vad option to set threshold --- README.md | 2 +- tests/test_cli.py | 1 - whispering/cli.py | 9 +++++---- whispering/schema.py | 3 +-- whispering/transcriber.py | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2a3c9ed..c14c41a 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ whispering --language en --model tiny - ``--no-progress`` disables the progress message - ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time - ``--debug`` outputs logs for debug -- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze non-voice activity sound period +- ``--vad`` sets VAD (Voice Activity Detection) threshold. 0 disables VAD and forces whisper to analyze non-voice activity sound period - ``--output`` sets output file (Default: Standard output) ### Parse interval diff --git a/tests/test_cli.py b/tests/test_cli.py index a0702be..a63c7d8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -15,7 +15,6 @@ def test_options(): "--mode server --beam_size 3", "--mode server --temperature 0", "--mode server --allow-padding", - "--mode server --no-vad", "--mode server --num_block 3", "--mode mic --host 0.0.0.0", "--mode mic --port 8000", diff --git a/whispering/cli.py b/whispering/cli.py index 1148751..884d999 100644 --- a/whispering/cli.py +++ b/whispering/cli.py @@ -149,8 +149,10 @@ def get_opts() -> argparse.Namespace: action="store_true", ) group_ctx.add_argument( - "--no-vad", - action="store_true", + "--vad", + type=float, + help="Threshold of VAD", + default=0.5, ) group_misc = parser.add_argument_group("Other options") @@ -223,7 +225,7 @@ def get_context(*, opts) -> Context: beam_size=opts.beam_size, temperatures=opts.temperature, 
allow_padding=opts.allow_padding, - vad=not opts.no_vad, + vad_threshold=opts.vad, ) logger.debug(f"Context: {ctx}") return ctx @@ -244,7 +246,6 @@ def is_valid_arg(opts) -> bool: "beam_size", "temperature", "allow_padding", - "no-vad", ] elif opts.mode == Mode.mic.value: keys = [ diff --git a/whispering/schema.py b/whispering/schema.py index ee0272f..e8b4e97 100644 --- a/whispering/schema.py +++ b/whispering/schema.py @@ -32,7 +32,6 @@ class Context(BaseModel, arbitrary_types_allowed=True): timestamp: float = 0.0 buffer_tokens: List[torch.Tensor] = [] buffer_mel: Optional[torch.Tensor] = None - vad: bool = True temperatures: List[float] allow_padding: bool = False @@ -46,7 +45,7 @@ class Context(BaseModel, arbitrary_types_allowed=True): logprob_threshold: Optional[float] = -1.0 compression_ratio_threshold: Optional[float] = 2.4 buffer_threshold: Optional[float] = 0.5 - vad_threshold: float = 0.5 + vad_threshold: float class ParsedChunk(BaseModel): diff --git a/whispering/transcriber.py b/whispering/transcriber.py index 878db24..45f5ddc 100644 --- a/whispering/transcriber.py +++ b/whispering/transcriber.py @@ -234,7 +234,7 @@ class WhisperStreamingTranscriber: ) -> Iterator[ParsedChunk]: logger.debug(f"{len(audio)}") - if ctx.vad: + if ctx.vad_threshold > 0.0: x = [ v for v in self.vad(