mirror of
https://github.com/shirayu/whispering.git
synced 2024-11-25 18:31:00 +00:00
Removed the --no-vad option and added a --vad option to set the VAD threshold
This commit is contained in:
parent
dce9719fea
commit
86f38c6ca9
5 changed files with 8 additions and 9 deletions
|
@ -41,7 +41,7 @@ whispering --language en --model tiny
|
||||||
- ``--no-progress`` disables the progress message
|
- ``--no-progress`` disables the progress message
|
||||||
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
|
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
|
||||||
- ``--debug`` outputs logs for debug
|
- ``--debug`` outputs logs for debug
|
||||||
- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze non-voice activity sound period
|
- ``--vad`` sets the VAD (Voice Activity Detection) threshold (default: 0.5). Setting it to 0 disables VAD and forces whisper to also analyze periods without voice activity
|
||||||
- ``--output`` sets output file (Default: Standard output)
|
- ``--output`` sets output file (Default: Standard output)
|
||||||
|
|
||||||
### Parse interval
|
### Parse interval
|
||||||
|
|
|
@ -15,7 +15,6 @@ def test_options():
|
||||||
"--mode server --beam_size 3",
|
"--mode server --beam_size 3",
|
||||||
"--mode server --temperature 0",
|
"--mode server --temperature 0",
|
||||||
"--mode server --allow-padding",
|
"--mode server --allow-padding",
|
||||||
"--mode server --no-vad",
|
|
||||||
"--mode server --num_block 3",
|
"--mode server --num_block 3",
|
||||||
"--mode mic --host 0.0.0.0",
|
"--mode mic --host 0.0.0.0",
|
||||||
"--mode mic --port 8000",
|
"--mode mic --port 8000",
|
||||||
|
|
|
@ -149,8 +149,10 @@ def get_opts() -> argparse.Namespace:
|
||||||
action="store_true",
|
action="store_true",
|
||||||
)
|
)
|
||||||
group_ctx.add_argument(
|
group_ctx.add_argument(
|
||||||
"--no-vad",
|
"--vad",
|
||||||
action="store_true",
|
type=float,
|
||||||
|
help="Threshold of VAD",
|
||||||
|
default=0.5,
|
||||||
)
|
)
|
||||||
|
|
||||||
group_misc = parser.add_argument_group("Other options")
|
group_misc = parser.add_argument_group("Other options")
|
||||||
|
@ -223,7 +225,7 @@ def get_context(*, opts) -> Context:
|
||||||
beam_size=opts.beam_size,
|
beam_size=opts.beam_size,
|
||||||
temperatures=opts.temperature,
|
temperatures=opts.temperature,
|
||||||
allow_padding=opts.allow_padding,
|
allow_padding=opts.allow_padding,
|
||||||
vad=not opts.no_vad,
|
vad_threshold=opts.vad,
|
||||||
)
|
)
|
||||||
logger.debug(f"Context: {ctx}")
|
logger.debug(f"Context: {ctx}")
|
||||||
return ctx
|
return ctx
|
||||||
|
@ -244,7 +246,6 @@ def is_valid_arg(opts) -> bool:
|
||||||
"beam_size",
|
"beam_size",
|
||||||
"temperature",
|
"temperature",
|
||||||
"allow_padding",
|
"allow_padding",
|
||||||
"no-vad",
|
|
||||||
]
|
]
|
||||||
elif opts.mode == Mode.mic.value:
|
elif opts.mode == Mode.mic.value:
|
||||||
keys = [
|
keys = [
|
||||||
|
|
|
@ -32,7 +32,6 @@ class Context(BaseModel, arbitrary_types_allowed=True):
|
||||||
timestamp: float = 0.0
|
timestamp: float = 0.0
|
||||||
buffer_tokens: List[torch.Tensor] = []
|
buffer_tokens: List[torch.Tensor] = []
|
||||||
buffer_mel: Optional[torch.Tensor] = None
|
buffer_mel: Optional[torch.Tensor] = None
|
||||||
vad: bool = True
|
|
||||||
|
|
||||||
temperatures: List[float]
|
temperatures: List[float]
|
||||||
allow_padding: bool = False
|
allow_padding: bool = False
|
||||||
|
@ -46,7 +45,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
|
||||||
logprob_threshold: Optional[float] = -1.0
|
logprob_threshold: Optional[float] = -1.0
|
||||||
compression_ratio_threshold: Optional[float] = 2.4
|
compression_ratio_threshold: Optional[float] = 2.4
|
||||||
buffer_threshold: Optional[float] = 0.5
|
buffer_threshold: Optional[float] = 0.5
|
||||||
vad_threshold: float = 0.5
|
vad_threshold: float
|
||||||
|
|
||||||
|
|
||||||
class ParsedChunk(BaseModel):
|
class ParsedChunk(BaseModel):
|
||||||
|
|
|
@ -234,7 +234,7 @@ class WhisperStreamingTranscriber:
|
||||||
) -> Iterator[ParsedChunk]:
|
) -> Iterator[ParsedChunk]:
|
||||||
logger.debug(f"{len(audio)}")
|
logger.debug(f"{len(audio)}")
|
||||||
|
|
||||||
if ctx.vad:
|
if ctx.vad_threshold > 0.0:
|
||||||
x = [
|
x = [
|
||||||
v
|
v
|
||||||
for v in self.vad(
|
for v in self.vad(
|
||||||
|
|
Loading…
Reference in a new issue