mirror of
https://github.com/shirayu/whispering.git
synced 2024-11-25 10:21:00 +00:00
Add --no-vad option
This commit is contained in:
parent
08798f117a
commit
7f15cfeb39
4 changed files with 20 additions and 11 deletions
|
@ -34,6 +34,7 @@ whispering --language en --model tiny
|
||||||
- ``--no-progress`` disables the progress message
|
- ``--no-progress`` disables the progress message
|
||||||
- ``-t`` sets the temperatures used for decoding. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures make decoding take much longer
|
- ``-t`` sets the temperatures used for decoding. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures make decoding take much longer
|
||||||
- ``--debug`` outputs logs for debug
|
- ``--debug`` outputs logs for debug
|
||||||
|
- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to also analyze periods of sound that contain no voice activity
|
||||||
|
|
||||||
### Parse interval
|
### Parse interval
|
||||||
|
|
||||||
|
|
|
@ -155,6 +155,10 @@ def get_opts() -> argparse.Namespace:
|
||||||
"--no-progress",
|
"--no-progress",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-vad",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
opts = parser.parse_args()
|
opts = parser.parse_args()
|
||||||
|
|
||||||
if opts.beam_size <= 0:
|
if opts.beam_size <= 0:
|
||||||
|
@ -187,6 +191,7 @@ def get_context(*, opts) -> Context:
|
||||||
beam_size=opts.beam_size,
|
beam_size=opts.beam_size,
|
||||||
temperatures=opts.temperature,
|
temperatures=opts.temperature,
|
||||||
allow_padding=opts.allow_padding,
|
allow_padding=opts.allow_padding,
|
||||||
|
vad=not opts.no_vad,
|
||||||
)
|
)
|
||||||
logger.debug(f"Context: {ctx}")
|
logger.debug(f"Context: {ctx}")
|
||||||
return ctx
|
return ctx
|
||||||
|
|
|
@ -27,6 +27,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
|
||||||
timestamp: float = 0.0
|
timestamp: float = 0.0
|
||||||
buffer_tokens: List[torch.Tensor] = []
|
buffer_tokens: List[torch.Tensor] = []
|
||||||
buffer_mel: Optional[torch.Tensor] = None
|
buffer_mel: Optional[torch.Tensor] = None
|
||||||
|
vad: bool = True
|
||||||
|
|
||||||
temperatures: List[float]
|
temperatures: List[float]
|
||||||
allow_padding: bool = False
|
allow_padding: bool = False
|
||||||
|
|
|
@ -233,17 +233,19 @@ class WhisperStreamingTranscriber:
|
||||||
ctx: Context,
|
ctx: Context,
|
||||||
) -> Iterator[ParsedChunk]:
|
) -> Iterator[ParsedChunk]:
|
||||||
logger.debug(f"{len(audio)}")
|
logger.debug(f"{len(audio)}")
|
||||||
x = [
|
|
||||||
v
|
if not ctx.vad:
|
||||||
for v in self.vad(
|
x = [
|
||||||
audio=audio,
|
v
|
||||||
total_block_number=1,
|
for v in self.vad(
|
||||||
)
|
audio=audio,
|
||||||
]
|
total_block_number=1,
|
||||||
if len(x) == 0: # No speech
|
)
|
||||||
logger.debug("No speech")
|
]
|
||||||
ctx.timestamp += len(audio) / N_FRAMES * self.duration_pre_one_mel
|
if len(x) == 0: # No speech
|
||||||
return
|
logger.debug("No speech")
|
||||||
|
ctx.timestamp += len(audio) / N_FRAMES * self.duration_pre_one_mel
|
||||||
|
return
|
||||||
|
|
||||||
new_mel = log_mel_spectrogram(audio=audio)
|
new_mel = log_mel_spectrogram(audio=audio)
|
||||||
logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
|
logger.debug(f"Incoming new_mel.shape: {new_mel.shape}")
|
||||||
|
|
Loading…
Reference in a new issue