Mirror of https://github.com/shirayu/whispering.git (synced 2024-11-22 08:51:01 +00:00)
Add --frame option

parent 8b464ff85d
commit 53970f3b51

4 changed files with 15 additions and 4 deletions
@@ -44,6 +44,7 @@ whispering --language en --model tiny
 - ``--debug`` outputs logs for debug
 - ``--vad`` sets VAD (Voice Activity Detection) threshold. The default is ``0.5``. ``0`` disables VAD and forces whisper to analyze non-voice activity sound period. Try ``--vad 0`` if VAD prevents transcription.
 - ``--output`` sets output file (Default: Standard output)
+- ``--frame``: the number of minimum frames of mel spectrogram input for Whisper (default: ``3000``. i.e. 30 seconds)
 
 ### Parse interval
 
@@ -55,6 +56,7 @@ If you want to disable VAD, please make VAD threshold 0 by adding ``--vad 0``.
 By default, whispering does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
 This is because the original Whisper assumes that the inputs are 30 seconds segments.
 However, if silence segments appear 16 times (the default value of ``--max_nospeech_skip``) after speech is detected, the analysis is performed.
+You can make the length of segments smaller with ``--frame`` option (default: 3000), but it sacrifices accuracy because this is not expected input for Whisper.
 
 ## Example of web socket
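For orientation on the numbers above: Whisper's log-Mel spectrogram uses a 160-sample hop at 16 kHz, so one mel frame covers 10 ms and the default of 3000 frames covers 30 seconds. A minimal sketch of that arithmetic, using the constants that whisper.audio exposes:

# Minimal sketch of the frame/second arithmetic behind --frame.
# SAMPLE_RATE, HOP_LENGTH, and N_FRAMES come from whisper.audio
# (16000, 160, and 3000 in the original Whisper).
from whisper.audio import HOP_LENGTH, N_FRAMES, SAMPLE_RATE

frames_per_second = SAMPLE_RATE // HOP_LENGTH  # 16000 // 160 = 100

print(N_FRAMES / frames_per_second)  # 30.0 seconds: the default window
print(1500 / frames_per_second)      # 15.0 seconds: e.g. with --frame 1500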
@@ -156,6 +156,12 @@ def get_opts() -> argparse.Namespace:
         help="Maximum number of skip to analyze because of nospeech",
         default=16,
     )
+    group_ctx.add_argument(
+        "--frame",
+        type=int,
+        help="The number of minimum frames of mel spectrogram input for Whisper",
+        default=N_FRAMES,
+    )
 
     group_misc = parser.add_argument_group("Other options")
     group_misc.add_argument(
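For illustration, a self-contained sketch of how the new flag parses; the parser and group names here are hypothetical stand-ins, and only the --frame argument itself mirrors the diff above:

# Self-contained sketch of the --frame flag; "Context options" and the
# bare ArgumentParser are illustrative, not the project's actual setup.
import argparse

parser = argparse.ArgumentParser()
group_ctx = parser.add_argument_group("Context options")
group_ctx.add_argument(
    "--frame",
    type=int,
    help="The number of minimum frames of mel spectrogram input for Whisper",
    default=3000,  # stands in for whisper.audio.N_FRAMES
)

print(parser.parse_args([]).frame)                   # 3000
print(parser.parse_args(["--frame", "1500"]).frame)  # 1500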
@@ -228,6 +234,7 @@ def get_context(*, opts) -> Context:
         temperatures=opts.temperature,
         max_nospeech_skip=opts.max_nospeech_skip,
         vad_threshold=opts.vad,
+        mel_frame_min_num=opts.frame,
     )
     logger.debug(f"Context: {ctx}")
     return ctx
@@ -5,7 +5,8 @@ from typing import Final, List, Optional
 
 import numpy as np
 import torch
-from pydantic import BaseModel, root_validator
+from pydantic import BaseModel, Field, root_validator
+from whisper.audio import N_FRAMES
 
 
 class WhisperConfig(BaseModel):
@@ -24,7 +25,7 @@ class WhisperConfig(BaseModel):
         return values
 
 
-CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_002")
+CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_003")
 
 
 class Context(BaseModel, arbitrary_types_allowed=True):
@@ -47,6 +48,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
     buffer_threshold: Optional[float] = 0.5
     vad_threshold: float
     max_nospeech_skip: int
+    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)
 
     data_type: str = "float32"
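The Field(N_FRAMES, ge=1, le=N_FRAMES) bounds let --frame shrink the analysis window but never exceed Whisper's native 3000 frames. A minimal sketch with a stand-in model (the real Context carries many more fields):

# Minimal sketch of the Field bounds; Ctx is a stand-in for the
# project's Context model.
from pydantic import BaseModel, Field, ValidationError

N_FRAMES = 3000  # stands in for whisper.audio.N_FRAMES

class Ctx(BaseModel):
    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)

print(Ctx().mel_frame_min_num)                        # 3000 (default)
print(Ctx(mel_frame_min_num=1500).mel_frame_min_num)  # 1500

for bad in (0, 3001):
    try:
        Ctx(mel_frame_min_num=bad)
    except ValidationError:
        print(f"{bad} rejected")  # outside [1, 3000]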
@@ -278,9 +278,9 @@ class WhisperStreamingTranscriber:
         if mel.shape[-1] - seek <= 0:
             logger.debug(f"No more seek: mel.shape={mel.shape}, seek={seek}")
             break
-        if mel.shape[-1] - seek < N_FRAMES:
+        if mel.shape[-1] - seek < ctx.mel_frame_min_num:
             logger.debug(
-                f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})"
+                f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < ctx.mel_frame_min_num ({ctx.mel_frame_min_num})"
             )
             if force_padding:
                 logger.debug("Padding")
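Net effect of this hunk: the short-input branch now triggers at the configurable ctx.mel_frame_min_num instead of the hard-coded N_FRAMES. A simplified, hypothetical distillation of the gating logic (not the actual transcriber code):

# Hypothetical distillation of the loop's gating, simplified from the
# hunk above: decode only once enough mel frames remain past seek.
def gate(total_frames: int, seek: int, mel_frame_min_num: int,
         force_padding: bool) -> str:
    remaining = total_frames - seek
    if remaining <= 0:
        return "stop"  # no more mel frames to decode
    if remaining < mel_frame_min_num:
        # short tail: decode only if the caller allows padding it out
        return "pad-and-decode" if force_padding else "wait"
    return "decode"

print(gate(4500, 0, 3000, False))     # decode: a full window remains
print(gate(4500, 3000, 3000, False))  # wait: only 1500 frames remain
print(gate(4500, 3000, 1500, False))  # decode: --frame 1500 lowers the bar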