Removed --no-vad option and added --vad option to set threshold

This commit is contained in:
Yuta Hayashibe 2022-10-15 13:33:39 +09:00
parent dce9719fea
commit 86f38c6ca9
5 changed files with 8 additions and 9 deletions

View file

@ -41,7 +41,7 @@ whispering --language en --model tiny
- ``--no-progress`` disables the progress message
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
- ``--debug`` outputs logs for debug
- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze non-voice activity sound period
- ``--vad`` sets the VAD (Voice Activity Detection) threshold (Default: 0.5). 0 disables VAD and forces whisper to analyze non-voice activity sound periods
- ``--output`` sets output file (Default: Standard output)
### Parse interval

View file

@ -15,7 +15,6 @@ def test_options():
"--mode server --beam_size 3",
"--mode server --temperature 0",
"--mode server --allow-padding",
"--mode server --no-vad",
"--mode server --num_block 3",
"--mode mic --host 0.0.0.0",
"--mode mic --port 8000",

View file

@ -149,8 +149,10 @@ def get_opts() -> argparse.Namespace:
action="store_true",
)
group_ctx.add_argument(
"--no-vad",
action="store_true",
"--vad",
type=float,
help="Threshold of VAD",
default=0.5,
)
group_misc = parser.add_argument_group("Other options")
@ -223,7 +225,7 @@ def get_context(*, opts) -> Context:
beam_size=opts.beam_size,
temperatures=opts.temperature,
allow_padding=opts.allow_padding,
vad=not opts.no_vad,
vad_threshold=opts.vad,
)
logger.debug(f"Context: {ctx}")
return ctx
@ -244,7 +246,6 @@ def is_valid_arg(opts) -> bool:
"beam_size",
"temperature",
"allow_padding",
"no-vad",
]
elif opts.mode == Mode.mic.value:
keys = [

View file

@ -32,7 +32,6 @@ class Context(BaseModel, arbitrary_types_allowed=True):
timestamp: float = 0.0
buffer_tokens: List[torch.Tensor] = []
buffer_mel: Optional[torch.Tensor] = None
vad: bool = True
temperatures: List[float]
allow_padding: bool = False
@ -46,7 +45,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
logprob_threshold: Optional[float] = -1.0
compression_ratio_threshold: Optional[float] = 2.4
buffer_threshold: Optional[float] = 0.5
vad_threshold: float = 0.5
vad_threshold: float
class ParsedChunk(BaseModel):

View file

@ -234,7 +234,7 @@ class WhisperStreamingTranscriber:
) -> Iterator[ParsedChunk]:
logger.debug(f"{len(audio)}")
if ctx.vad:
if ctx.vad_threshold > 0.0:
x = [
v
for v in self.vad(