Merge branch 'master' into docker

ricwo 2022-11-16 15:57:40 +01:00
commit f5d3d57ca2
7 changed files with 28 additions and 16 deletions

View file

@@ -14,7 +14,7 @@ Enough machine power is needed to transcribe in real time.
## Setup
```bash
pip install -U git+https://github.com/shirayu/whispering.git@v0.6.3
pip install -U git+https://github.com/shirayu/whispering.git@v0.6.4
# If you use GPU, install proper torch and torchaudio
# Check https://pytorch.org/get-started/locally/
@@ -44,6 +44,7 @@ whispering --language en --model tiny
- ``--debug`` outputs logs for debug
- ``--vad`` sets the VAD (Voice Activity Detection) threshold. The default is ``0.5``. ``0`` disables VAD and forces whisper to analyze periods without voice activity as well. Try ``--vad 0`` if VAD prevents transcription.
- ``--output`` sets output file (Default: Standard output)
- ``--frame`` sets the minimum number of mel spectrogram frames passed to Whisper (default: ``3000``, i.e. 30 seconds)
### Parse interval
@@ -52,8 +53,10 @@ This interval is determined by the value of ``-n`` and its default is ``20``.
When an interval is predicted as "silence", it will not be passed to whisper.
If you want to disable VAD, set the VAD threshold to ``0`` by adding ``--vad 0``.
By default, Whisper does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
By default, whispering does not perform analysis until the total length of the segments determined by VAD to have speech exceeds 30 seconds.
This is because the original Whisper assumes that its inputs are 30-second segments.
However, if silence segments appear 16 times (the default value of ``--max_nospeech_skip``) after speech is detected, the analysis is performed anyway.
You can make the segments shorter with the ``--frame`` option (default: ``3000``), but this sacrifices accuracy because Whisper does not expect such short inputs.
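For intuition, the conversion between mel frames and seconds follows from the constants in ``whisper.audio``. The following is a minimal sketch (not part of whispering itself) showing why ``3000`` frames correspond to 30 seconds:

```python
# Minimal sketch: relation between mel frames and seconds, using whisper's constants.
from whisper.audio import HOP_LENGTH, N_FRAMES, SAMPLE_RATE

frames_per_second = SAMPLE_RATE / HOP_LENGTH  # 16000 / 160 = 100 frames per second
print(N_FRAMES / frames_per_second)           # 3000 / 100 = 30.0 seconds
print(1500 / frames_per_second)               # e.g. --frame 1500 is roughly 15.0 seconds
```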
## Example of web socket

package-lock.json generated
View file

@@ -9,7 +9,7 @@
"version": "1.0.0",
"devDependencies": {
"markdownlint-cli": "^0.32.1",
"pyright": "^1.1.278"
"pyright": "^1.1.279"
}
},
"node_modules/argparse": {
@@ -259,9 +259,9 @@
}
},
"node_modules/pyright": {
"version": "1.1.278",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.278.tgz",
"integrity": "sha512-at3j7c1fFzB6Jl4+bpr9QPRC/+1gH2gAR/M6GIRS312CHE2JMt8FZRflTbuxEB8IfQAtR+l3YoRMoS1vqF28jw==",
"version": "1.1.279",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.279.tgz",
"integrity": "sha512-Npj9xGdspkaCLUttTDYK6nSlw403vzFlfMXdwCVH8u6BvITnSH/pJHT+CyYvrILXNevRazgltT7JDbWg2VJEFA==",
"dev": true,
"bin": {
"pyright": "index.js",
@@ -505,9 +505,9 @@
}
},
"pyright": {
"version": "1.1.278",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.278.tgz",
"integrity": "sha512-at3j7c1fFzB6Jl4+bpr9QPRC/+1gH2gAR/M6GIRS312CHE2JMt8FZRflTbuxEB8IfQAtR+l3YoRMoS1vqF28jw==",
"version": "1.1.279",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.279.tgz",
"integrity": "sha512-Npj9xGdspkaCLUttTDYK6nSlw403vzFlfMXdwCVH8u6BvITnSH/pJHT+CyYvrILXNevRazgltT7JDbWg2VJEFA==",
"dev": true
},
"run-con": {

View file

@@ -11,6 +11,6 @@
"dependencies": {},
"devDependencies": {
"markdownlint-cli": "^0.32.1",
"pyright": "^1.1.278"
"pyright": "^1.1.279"
}
}

View file

@@ -1,9 +1,9 @@
[misc]
stable_version = "0.6.3"
stable_version = "0.6.4"
[tool.poetry]
name = "whispering"
version = "0.6.4dev0"
version = "0.6.5dev0"
description = "Streaming transcriber with whisper"
license = "MIT"
authors = ["Yuta Hayashibe <yuta@hayashibe.jp>"]

View file

@@ -161,6 +161,12 @@ def get_opts() -> argparse.Namespace:
        help="Maximum number of skip to analyze because of nospeech",
        default=16,
    )
    group_ctx.add_argument(
        "--frame",
        type=int,
        help="The number of minimum frames of mel spectrogram input for Whisper",
        default=N_FRAMES,
    )
    group_misc = parser.add_argument_group("Other options")
    group_misc.add_argument(
@@ -233,6 +239,7 @@ def get_context(*, opts) -> Context:
        temperatures=opts.temperature,
        max_nospeech_skip=opts.max_nospeech_skip,
        vad_threshold=opts.vad,
        mel_frame_min_num=opts.frame,
    )
    logger.debug(f"Context: {ctx}")
    return ctx
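A minimal, standalone sketch of the same ``argparse`` pattern (an illustrative script only, not the project's actual module; it merely mirrors the ``--frame`` option added above):

```python
import argparse

from whisper.audio import N_FRAMES  # 3000 frames, i.e. 30 seconds


def parse_demo_args(argv=None) -> argparse.Namespace:
    # Mirrors the new --frame option: an integer that defaults to N_FRAMES.
    parser = argparse.ArgumentParser(description="frame option demo")
    group_ctx = parser.add_argument_group("Context options")
    group_ctx.add_argument(
        "--frame",
        type=int,
        default=N_FRAMES,
        help="The number of minimum frames of mel spectrogram input for Whisper",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    opts = parse_demo_args(["--frame", "1500"])
    print(opts.frame)  # 1500; this value is what feeds Context(mel_frame_min_num=opts.frame)
```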

View file

@@ -5,7 +5,8 @@ from typing import Final, List, Optional
import numpy as np
import torch
from pydantic import BaseModel, root_validator
from pydantic import BaseModel, Field, root_validator
from whisper.audio import N_FRAMES
class WhisperConfig(BaseModel):
@@ -24,7 +25,7 @@ class WhisperConfig(BaseModel):
        return values
CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_002")
CURRENT_PROTOCOL_VERSION: Final[int] = int("000_006_003")
class Context(BaseModel, arbitrary_types_allowed=True):
@@ -47,6 +48,7 @@ class Context(BaseModel, arbitrary_types_allowed=True):
    buffer_threshold: Optional[float] = 0.5
    vad_threshold: float
    max_nospeech_skip: int
    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)
    data_type: str = "float32"
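The ``Field(N_FRAMES, ge=1, le=N_FRAMES)`` declaration means pydantic rejects values outside ``[1, N_FRAMES]`` at construction time. A minimal sketch of that behaviour with a toy model (hypothetical ``Demo`` class, not the real ``Context``):

```python
from pydantic import BaseModel, Field, ValidationError
from whisper.audio import N_FRAMES  # 3000


class Demo(BaseModel):
    # Same constraint as Context.mel_frame_min_num: defaults to N_FRAMES, bounded to [1, N_FRAMES].
    mel_frame_min_num: int = Field(N_FRAMES, ge=1, le=N_FRAMES)


print(Demo().mel_frame_min_num)        # 3000 (default)
print(Demo(mel_frame_min_num=1500))    # accepted: 1 <= 1500 <= 3000

try:
    Demo(mel_frame_min_num=5000)       # rejected: larger than N_FRAMES
except ValidationError as err:
    print(err)
```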

View file

@@ -274,9 +274,9 @@ class WhisperStreamingTranscriber:
            if mel.shape[-1] - seek <= 0:
                logger.debug(f"No more seek: mel.shape={mel.shape}, seek={seek}")
                break
            if mel.shape[-1] - seek < N_FRAMES:
            if mel.shape[-1] - seek < ctx.mel_frame_min_num:
                logger.debug(
                    f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})"
                    f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < ctx.mel_frame_min_num ({ctx.mel_frame_min_num})"
                )
                if force_padding:
                    logger.debug("Padding")