Mirror of https://github.com/shirayu/whispering.git (synced 2024-11-22 00:41:02 +00:00)
Add --frame option
parent 8b464ff85d
commit 53970f3b51
4 changed files with 15 additions and 4 deletions
@@ -44,6 +44,7 @@ whispering --language en --model tiny
 - ``--debug`` outputs debug logs
 - ``--vad`` sets the VAD (Voice Activity Detection) threshold. The default is ``0.5``. ``0`` disables VAD and forces Whisper to analyze periods without voice activity as well. Try ``--vad 0`` if VAD prevents transcription.
 - ``--output`` sets the output file (default: standard output)
+- ``--frame`` sets the minimum number of mel-spectrogram frames given to Whisper as input (default: ``3000``, i.e. 30 seconds; see the frame-to-seconds sketch below)
 
 ### Parse interval
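On the ``--frame`` default above: Whisper computes its log-mel spectrogram from 16 kHz audio with a hop length of 160 samples, so 100 frames correspond to one second and 3000 frames to exactly 30 seconds. A quick sanity check using the constants that openai-whisper itself exposes:

```python
# Frame/seconds conversion, checked against whisper's own constants.
from whisper.audio import HOP_LENGTH, N_FRAMES, SAMPLE_RATE

frames_per_second = SAMPLE_RATE // HOP_LENGTH  # 16000 // 160 == 100

def seconds_to_frames(seconds: float) -> int:
    """Convert a duration in seconds to mel-spectrogram frames."""
    return int(seconds * frames_per_second)

print(seconds_to_frames(30))  # 3000 == N_FRAMES, the --frame default
print(seconds_to_frames(10))  # 1000, a smaller window for --frame
```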
@@ -55,6 +56,7 @@ If you want to disable VAD, please make VAD threshold 0 by adding ``--vad 0``.
 
 By default, whispering does not perform analysis until the total length of the segments that VAD determines to contain speech exceeds 30 seconds.
 This is because the original Whisper assumes that its inputs are 30-second segments.
 However, if silence segments appear 16 times (the default value of ``--max_nospeech_skip``) after speech is detected, the analysis is performed anyway (this trigger rule is sketched below).
+You can make the segments shorter with the ``--frame`` option (default: 3000), but this sacrifices accuracy because shorter input is not what Whisper expects.
 
 ## Example of web socket
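The paragraph above amounts to a simple trigger rule: run Whisper once enough speech frames are buffered, or once too many silent segments have followed speech. A simplified sketch of that decision (all names and the function itself are illustrative, not the project's real API; the actual logic lives in its streaming transcriber):

```python
# Simplified sketch of whispering's "analyze now?" decision.
# Hypothetical names; not the project's actual code.
def should_analyze(
    buffered_speech_frames: int,    # mel frames of VAD-detected speech so far
    nospeech_count: int,            # silent segments seen since last speech
    mel_frame_min_num: int = 3000,  # --frame
    max_nospeech_skip: int = 16,    # --max_nospeech_skip
) -> bool:
    # Enough speech buffered to fill the minimum window Whisper will get...
    if buffered_speech_frames >= mel_frame_min_num:
        return True
    # ...or speech was followed by too many silent segments.
    return nospeech_count >= max_nospeech_skip
```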
@@ -156,6 +156,12 @@ def get_opts() -> argparse.Namespace:
         help="Maximum number of skip to analyze because of nospeech",
         default=16,
     )
+    group_ctx.add_argument(
+        "--frame",
+        type=int,
+        help="The number of minimum frames of mel spectrogram input for Whisper",
+        default=N_FRAMES,
+    )
 
     group_misc = parser.add_argument_group("Other options")
     group_misc.add_argument(
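A self-contained sketch of how the new flag parses (illustration only; the argument-group title here is made up, while the ``add_argument`` call mirrors the one added above):

```python
# Minimal reproduction of --frame parsing with its N_FRAMES default.
import argparse

from whisper.audio import N_FRAMES  # 3000

parser = argparse.ArgumentParser()
group_ctx = parser.add_argument_group("Context options")  # illustrative title
group_ctx.add_argument(
    "--frame",
    type=int,
    help="The number of minimum frames of mel spectrogram input for Whisper",
    default=N_FRAMES,
)

print(parser.parse_args([]).frame)                   # 3000 (default)
print(parser.parse_args(["--frame", "1000"]).frame)  # 1000
```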
@@ -228,6 +234,7 @@ def get_context(*, opts) -> Context:
         temperatures=opts.temperature,
         max_nospeech_skip=opts.max_nospeech_skip,
         vad_threshold=opts.vad,
+        mel_frame_min_num=opts.frame,
     )
     logger.debug(f"Context: {ctx}")
     return ctx
@@ -5,7 +5,8 @@ from typing import Final, List, Optional
 
 import numpy as np
 import torch
-from pydantic import BaseModel, root_validator
+from pydantic import BaseModel, Field, root_validator
+from whisper.audio import N_FRAMES
 
 
 class WhisperConfig(BaseModel):
@@ -24,7 +25,7 @@ class WhisperConfig(BaseModel):
         return values
 
 
-CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_002")
+CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_003")
 
 
 class Context(BaseModel, arbitrary_types_allowed=True):
@@ -47,6 +48,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     buffer_threshold: Optional[float] = 0.5
     vad_threshold: float
     max_nospeech_skip: int
+    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)
 
     data_type: str = "float32"
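``Field(N_FRAMES, ge=1, le=N_FRAMES)`` gives the new setting a default of 3000 and makes pydantic reject anything outside ``[1, 3000]`` when the ``Context`` is built. A standalone demonstration of those bounds (pydantic v1 style, matching the ``root_validator`` import above; ``FrameDemo`` is a stand-in for ``Context``):

```python
# Demonstrates the ge/le bounds Field() places on mel_frame_min_num.
from pydantic import BaseModel, Field, ValidationError
from whisper.audio import N_FRAMES  # 3000

class FrameDemo(BaseModel):  # stand-in for the project's Context model
    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)

print(FrameDemo().mel_frame_min_num)                        # 3000
print(FrameDemo(mel_frame_min_num=1000).mel_frame_min_num)  # 1000

try:
    FrameDemo(mel_frame_min_num=0)  # below ge=1 -> rejected
except ValidationError as err:
    print(err)
```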
@@ -278,9 +278,9 @@ class WhisperStreamingTranscriber:
         if mel.shape[-1] - seek <= 0:
             logger.debug(f"No more seek: mel.shape={mel.shape}, seek={seek}")
             break
-        if mel.shape[-1] - seek < N_FRAMES:
+        if mel.shape[-1] - seek < ctx.mel_frame_min_num:
             logger.debug(
-                f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})"
+                f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < ctx.mel_frame_min_num ({ctx.mel_frame_min_num})"
             )
             if force_padding:
                 logger.debug("Padding")
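When the frames remaining after ``seek`` are fewer than ``ctx.mel_frame_min_num`` and ``force_padding`` is set, the short segment must still be brought up to the full 30-second window Whisper expects before decoding. A hedged sketch of that padding step using ``whisper.audio.pad_or_trim`` (the project's actual padding code may differ; ``pad_segment`` is a hypothetical helper):

```python
# Sketch: zero-pad a too-short mel segment to Whisper's full window.
# `mel` (shape: n_mels x n_frames) and `seek` are assumed to come from
# the surrounding transcription loop.
import torch
from whisper.audio import N_FRAMES, pad_or_trim

def pad_segment(mel: torch.Tensor, seek: int) -> torch.Tensor:
    segment = mel[:, seek:]
    if segment.shape[-1] < N_FRAMES:
        # pad_or_trim zero-pads (or trims) along the given axis.
        segment = pad_or_trim(segment, N_FRAMES, axis=-1)
    return segment
```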