Merge branch 'master' into docker

ricwo 2022-11-16 15:57:40 +01:00
commit f5d3d57ca2
7 changed files with 28 additions and 16 deletions

View file

@@ -14,7 +14,7 @@ Enough machine power is needed to transcribe in real time.
## Setup
```bash
pip install -U git+https://github.com/shirayu/whispering.git@v0.6.3
pip install -U git+https://github.com/shirayu/whispering.git@v0.6.4
# If you use GPU, install proper torch and torchaudio
# Check https://pytorch.org/get-started/locally/
@@ -44,6 +44,7 @@ whispering --language en --model tiny
- ``--debug`` outputs logs for debug
- ``--vad`` sets the VAD (Voice Activity Detection) threshold. The default is ``0.5``. ``0`` disables VAD and forces whisper to analyze periods without voice activity as well. Try ``--vad 0`` if VAD prevents transcription.
- ``--output`` sets output file (Default: Standard output)
- ``--frame`` sets the minimum number of mel spectrogram frames passed to Whisper (default: ``3000``, i.e. 30 seconds)
### Parse interval
@@ -52,8 +53,10 @@ This interval is determined by the value of ``-n`` and its default is ``20``.
When an interval is predicted as "silence", it will not be passed to whisper.
If you want to disable VAD, set the VAD threshold to ``0`` by adding ``--vad 0``.
By default, Whisper does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
By default, whispering does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
This is because the original Whisper assumes that its inputs are 30-second segments.
However, if silence segments appear 16 times (the default value of ``--max_nospeech_skip``) after speech is detected, the analysis is performed anyway.
You can make the segments shorter with the ``--frame`` option (default: ``3000``), but this sacrifices accuracy because Whisper does not expect such short inputs.
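For intuition, the conversion between mel frames and seconds follows from the constants in ``whisper.audio``. The following is a minimal sketch (not part of whispering itself) showing why ``3000`` frames correspond to 30 seconds:

```python
# Minimal sketch: relation between mel frames and seconds, using whisper's constants.
from whisper.audio import HOP_LENGTH, N_FRAMES, SAMPLE_RATE

frames_per_second = SAMPLE_RATE / HOP_LENGTH  # 16000 / 160 = 100 frames per second
print(N_FRAMES / frames_per_second)           # 3000 / 100 = 30.0 seconds
print(1500 / frames_per_second)               # e.g. --frame 1500 is roughly 15.0 seconds
```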
## Example of web socket

package-lock.json generated
View file

@@ -9,7 +9,7 @@
"version": "1.0.0",
"devDependencies": {
"markdownlint-cli": "^0.32.1",
"pyright": "^1.1.278"
"pyright": "^1.1.279"
}
},
"node_modules/argparse": {
@@ -259,9 +259,9 @@
}
},
"node_modules/pyright": {
"version": "1.1.278",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.278.tgz",
"integrity": "sha512-at3j7c1fFzB6Jl4+bpr9QPRC/+1gH2gAR/M6GIRS312CHE2JMt8FZRflTbuxEB8IfQAtR+l3YoRMoS1vqF28jw==",
"version": "1.1.279",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.279.tgz",
"integrity": "sha512-Npj9xGdspkaCLUttTDYK6nSlw403vzFlfMXdwCVH8u6BvITnSH/pJHT+CyYvrILXNevRazgltT7JDbWg2VJEFA==",
"dev": true,
"bin": {
"pyright": "index.js",
@@ -505,9 +505,9 @@
}
},
"pyright": {
"version": "1.1.278",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.278.tgz",
"integrity": "sha512-at3j7c1fFzB6Jl4+bpr9QPRC/+1gH2gAR/M6GIRS312CHE2JMt8FZRflTbuxEB8IfQAtR+l3YoRMoS1vqF28jw==",
"version": "1.1.279",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.279.tgz",
"integrity": "sha512-Npj9xGdspkaCLUttTDYK6nSlw403vzFlfMXdwCVH8u6BvITnSH/pJHT+CyYvrILXNevRazgltT7JDbWg2VJEFA==",
"dev": true
},
"run-con": {

View file

@@ -11,6 +11,6 @@
"dependencies": {},
"devDependencies": {
"markdownlint-cli": "^0.32.1",
"pyright": "^1.1.278"
"pyright": "^1.1.279"
}
}

View file

@@ -1,9 +1,9 @@
[misc]
stable_version = "0.6.3"
stable_version = "0.6.4"
[tool.poetry]
name = "whispering"
version = "0.6.4dev0"
version = "0.6.5dev0"
description = "Streaming transcriber with whisper"
license = "MIT"
authors = ["Yuta Hayashibe <yuta@hayashibe.jp>"]

View file

@@ -161,6 +161,12 @@ def get_opts() -> argparse.Namespace:
        help="Maximum number of skip to analyze because of nospeech",
        default=16,
    )
    group_ctx.add_argument(
        "--frame",
        type=int,
        help="The number of minimum frames of mel spectrogram input for Whisper",
        default=N_FRAMES,
    )
    group_misc = parser.add_argument_group("Other options")
    group_misc.add_argument(
@@ -233,6 +239,7 @@ def get_context(*, opts) -> Context:
        temperatures=opts.temperature,
        max_nospeech_skip=opts.max_nospeech_skip,
        vad_threshold=opts.vad,
        mel_frame_min_num=opts.frame,
    )
    logger.debug(f"Context: {ctx}")
    return ctx
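A minimal, standalone sketch of the same ``argparse`` pattern (an illustrative script only, not the project's actual module; it merely mirrors the ``--frame`` option added above):

```python
import argparse

from whisper.audio import N_FRAMES  # 3000 frames, i.e. 30 seconds


def parse_demo_args(argv=None) -> argparse.Namespace:
    # Mirrors the new --frame option: an integer that defaults to N_FRAMES.
    parser = argparse.ArgumentParser(description="frame option demo")
    group_ctx = parser.add_argument_group("Context options")
    group_ctx.add_argument(
        "--frame",
        type=int,
        default=N_FRAMES,
        help="The number of minimum frames of mel spectrogram input for Whisper",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    opts = parse_demo_args(["--frame", "1500"])
    print(opts.frame)  # 1500; this value is what feeds Context(mel_frame_min_num=opts.frame)
```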

View file

@@ -5,7 +5,8 @@ from typing import Final, List, Optional
import numpy as np
import torch
from pydantic import BaseModel, root_validator
from pydantic import BaseModel, Field, root_validator
from whisper.audio import N_FRAMES
class WhisperConfig(BaseModel):
@@ -24,7 +25,7 @@ class WhisperConfig(BaseModel):
        return values
CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_002")
CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_003")
class Context(BaseModel, arbitrary_types_allowed=True):
@@ -47,6 +48,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
    buffer_threshold: Optional[float] = 0.5
    vad_threshold: float
    max_nospeech_skip: int
    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)
    data_type: str = "float32"
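The ``Field(N_FRAMES, ge=1, le=N_FRAMES)`` declaration means pydantic rejects values outside ``[1, N_FRAMES]`` at construction time. A minimal sketch of that behaviour with a toy model (hypothetical ``Demo`` class, not the real ``Context``):

```python
from pydantic import BaseModel, Field, ValidationError
from whisper.audio import N_FRAMES  # 3000


class Demo(BaseModel):
    # Same constraint as Context.mel_frame_min_num: defaults to N_FRAMES, bounded to [1, N_FRAMES].
    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)


print(Demo().mel_frame_min_num)        # 3000 (default)
print(Demo(mel_frame_min_num=1500))    # accepted: 1 <= 1500 <= 3000

try:
    Demo(mel_frame_min_num=5000)       # rejected: larger than N_FRAMES
except ValidationError as err:
    print(err)
```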

View file

@@ -274,9 +274,9 @@ class WhisperStreamingTranscriber:
            if mel.shape[-1] - seek <= 0:
                logger.debug(f"No more seek: mel.shape={mel.shape}, seek={seek}")
                break
            if mel.shape[-1] - seek < N_FRAMES:
            if mel.shape[-1] - seek < ctx.mel_frame_min_num:
                logger.debug(
                    f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})"
                    f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < ctx.mel_frame_min_num ({ctx.mel_frame_min_num})"
                )
                if force_padding:
                    logger.debug("Padding")