This commit is contained in:
Alexandra Ramassamy 2022-10-06 09:54:00 +02:00
commit 8c8e7c05b7
9 changed files with 91 additions and 30 deletions

View file

@ -18,4 +18,4 @@ jobs:
- uses: actions/checkout@v3
- name: typos-action
uses: crate-ci/typos@v1.12.7
uses: crate-ci/typos@v1.12.8

View file

@ -17,6 +17,7 @@ Enough machine power is needed to transcribe in real time.
pip install -U git+https://github.com/shirayu/whispering.git
# If you use GPU, install proper torch and torchaudio
# Check https://pytorch.org/get-started/locally/
# Example: torch for CUDA 11.6
pip install -U torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
```
@ -35,6 +36,7 @@ whispering --language en --model tiny
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
- ``--debug`` outputs logs for debug
- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze periods without voice activity as well
- ``--output`` sets output file (Default: Standard output)
### Parse interval

14
package-lock.json generated
View file

@ -9,7 +9,7 @@
"version": "1.0.0",
"devDependencies": {
"markdownlint-cli": "^0.32.1",
"pyright": "^1.1.270"
"pyright": "^1.1.273"
}
},
"node_modules/argparse": {
@ -256,9 +256,9 @@
}
},
"node_modules/pyright": {
"version": "1.1.272",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.272.tgz",
"integrity": "sha512-32AEfp7JwZ7aaSFoAObvw9CRNyctZT7UIs+4O2bBhAg1+2UaVRXUTbBeKqJD+hTuA9I+HYOLaCJWcGug70R7LA==",
"version": "1.1.273",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.273.tgz",
"integrity": "sha512-uhBqKtRnC1Rvgz7uKp13VEwIR/UuqUvlscOu/y6hQhDzpFrZi0Gft7TrSLIMdy7fRAf85dS1nduQmAIWXgl4AA==",
"dev": true,
"bin": {
"pyright": "index.js",
@ -502,9 +502,9 @@
}
},
"pyright": {
"version": "1.1.272",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.272.tgz",
"integrity": "sha512-32AEfp7JwZ7aaSFoAObvw9CRNyctZT7UIs+4O2bBhAg1+2UaVRXUTbBeKqJD+hTuA9I+HYOLaCJWcGug70R7LA==",
"version": "1.1.273",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.273.tgz",
"integrity": "sha512-uhBqKtRnC1Rvgz7uKp13VEwIR/UuqUvlscOu/y6hQhDzpFrZi0Gft7TrSLIMdy7fRAf85dS1nduQmAIWXgl4AA==",
"dev": true
},
"run-con": {

View file

@ -11,6 +11,6 @@
"dependencies": {},
"devDependencies": {
"markdownlint-cli": "^0.32.1",
"pyright": "^1.1.270"
"pyright": "^1.1.273"
}
}

27
poetry.lock generated
View file

@ -291,6 +291,20 @@ python-versions = ">=3.6.8"
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "PySoundFile"
version = "0.9.0.post1"
description = "An audio library based on libsndfile, CFFI and NumPy"
category = "main"
optional = false
python-versions = "*"
[package.dependencies]
cffi = ">=0.6"
[package.extras]
numpy = ["numpy"]
[[package]]
name = "PyYAML"
version = "6.0"
@ -519,13 +533,13 @@ dev = ["pytest"]
[package.source]
type = "git"
url = "https://github.com/openai/whisper.git"
reference = '0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f'
resolved_reference = "0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f"
reference = '9e653bd0ea0f1e9493cb4939733e9de249493cfb'
resolved_reference = "9e653bd0ea0f1e9493cb4939733e9de249493cfb"
[metadata]
lock-version = "1.1"
python-versions = ">=3.8,<3.11"
content-hash = "ab527970383bc2245dee005627d0695812601115a36e15a5ef9e66d1185791bf"
content-hash = "e03ed06253a5fa7329768e3c8ebb0874055f4cfe06ac1bd5e5b79e06157ba37e"
[metadata.files]
black = [
@ -774,6 +788,13 @@ pyparsing = [
{file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
]
PySoundFile = [
{file = "PySoundFile-0.9.0.post1-py2.py3-none-any.whl", hash = "sha256:db14f84f4af1910f54766cf0c0f19d52414fa80aa0e11cb338b5614946f39947"},
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-macosx_10_5_x86_64.macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.whl", hash = "sha256:5889138553f4e675158054f8f41c212ca76ac0e2d949e38d1dd8ded4ca3f0ce0"},
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-win32.whl", hash = "sha256:c5c5cc8e5f3793a4b9f405c0c77e116e859ac16e065bb6b7f78f2a59484fd7a8"},
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-win_amd64.whl", hash = "sha256:d92afd505d395523200d5b7f217e409bae4639c90cc61e90832a57a5a0fb484a"},
{file = "PySoundFile-0.9.0.post1.tar.gz", hash = "sha256:43dd46a2afc0484c26930a7e59eef9365cee81bce7a4aadc5699f788f60d32c3"},
]
PyYAML = [
{file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
{file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},

View file

@ -1,5 +1,5 @@
[tool.poetry]
name = "whisper-streaming"
name = "whispering"
version = "0.1.0"
description = ""
authors = ["Yuta Hayashibe <yuta@hayashibe.jp>"]
@ -8,12 +8,13 @@ packages = [{include = "whispering"}]
[tool.poetry.dependencies]
python = ">=3.8,<3.11"
whisper = {git = "https://github.com/openai/whisper.git", rev = '0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f'}
whisper = {git = "https://github.com/openai/whisper.git", rev = '9e653bd0ea0f1e9493cb4939733e9de249493cfb'}
sounddevice = "^0.4.5"
pydantic = "^1.10.2"
websockets = "^10.3"
tqdm = "*"
torchaudio = "^0.12.1"
PySoundFile = {version = "^0.9.0.post1", platform = "win32"}
[tool.poetry.group.dev.dependencies]

View file

@ -6,7 +6,8 @@ import queue
import sys
from enum import Enum
from logging import DEBUG, INFO, basicConfig, getLogger
from typing import Optional, Union
from pathlib import Path
from typing import Iterator, Optional, Union
import sounddevice as sd
import torch
@ -15,7 +16,7 @@ from whisper.audio import N_FRAMES, SAMPLE_RATE
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
"""
from whispering.pbar import ProgressBar
from whispering.schema import Context, WhisperConfig
from whispering.schema import Context, StdoutWriter, WhisperConfig
from whispering.serve import serve_with_websocket
from whispering.transcriber import WhisperStreamingTranscriber
from whispering.websocket_client import run_websocket_client
@ -45,7 +46,7 @@ def transcribe_from_mic(
num_block: int,
ctx: Context,
no_progress: bool,
) -> None:
) -> Iterator[str]:
q = queue.Queue()
def sd_callback(indata, frames, time, status):
@ -88,7 +89,7 @@ def transcribe_from_mic(
if not no_progress:
sys.stderr.write("\r")
sys.stderr.flush()
print(f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}")
yield f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}\n"
if not no_progress:
sys.stderr.write("Analyzing")
sys.stderr.flush()
@ -156,6 +157,13 @@ def get_opts() -> argparse.Namespace:
)
group_misc = parser.add_argument_group("Other options")
group_misc.add_argument(
"--output",
"-o",
help="Output file",
type=Path,
default=StdoutWriter(),
)
group_misc.add_argument(
"--mic",
help="Set MIC device",
@ -275,6 +283,7 @@ def main() -> None:
port=opts.port,
no_progress=opts.no_progress,
ctx=ctx,
path_out=opts.output,
)
)
except KeyboardInterrupt:
@ -295,13 +304,16 @@ def main() -> None:
assert opts.model is not None
wsp = get_wshiper(opts=opts)
ctx: Context = get_context(opts=opts)
transcribe_from_mic(
wsp=wsp,
sd_device=opts.mic,
num_block=opts.num_block,
no_progress=opts.no_progress,
ctx=ctx,
)
with opts.output.open("w") as outf:
for text in transcribe_from_mic(
wsp=wsp,
sd_device=opts.mic,
num_block=opts.num_block,
no_progress=opts.no_progress,
ctx=ctx,
):
outf.write(text)
outf.flush()
if __name__ == "__main__":

View file

@ -1,5 +1,6 @@
#!/usr/bin/env python3
import sys
from typing import List, Optional
import numpy as np
@ -59,3 +60,20 @@ class SpeechSegment(BaseModel, arbitrary_types_allowed=True):
start_block_idx: int
end_block_idx: int
audio: np.ndarray
class StdoutWriter:
def open(self, *args, **kwargs):
return self
def __enter__(self, *args, **kwargs):
return self
def __exit__(self):
pass
def flush(self):
sys.stdout.flush()
def write(self, text):
sys.stdout.write(text)

View file

@ -3,14 +3,15 @@ import asyncio
import json
import sys
from logging import getLogger
from pathlib import Path
from typing import Optional, Union
import sounddevice as sd
import websockets
from whisper.audio import N_FRAMES, SAMPLE_RATE
from schema import ParsedChunk
from transcriber import Context
from whispering.schema import ParsedChunk, StdoutWriter
from whispering.transcriber import Context
logger = getLogger(__name__)
@ -28,6 +29,7 @@ async def transcribe_from_mic_and_send(
host: str,
port: int,
ctx: Context,
path_out: Union[Path, StdoutWriter],
) -> None:
uri = f"ws://{host}:{port}"
@ -38,7 +40,7 @@ async def transcribe_from_mic_and_send(
dtype="float32",
channels=1,
callback=sd_callback,
):
), path_out.open("w") as outf:
async with websockets.connect(uri, max_size=999999999) as ws: # type:ignore
logger.debug("Sent context")
v: str = ctx.json()
@ -69,10 +71,13 @@ async def transcribe_from_mic_and_send(
c = await asyncio.wait_for(recv(), timeout=0.5)
c_json = json.loads(c)
if (err := c_json.get("error")) is not None:
print(f"Error: {err}")
sys.stderr.write(f"Error: {err}\n")
sys.exit(1)
chunk = ParsedChunk.parse_obj(c_json)
print(f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}")
outf.write(
f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}\n"
)
outf.flush()
except asyncio.TimeoutError:
break
idx += 1
@ -86,6 +91,7 @@ async def run_websocket_client(
port: int,
ctx: Context,
no_progress: bool,
path_out: Union[Path, StdoutWriter],
) -> None:
global q
global loop
@ -98,4 +104,5 @@ async def run_websocket_client(
host=host,
port=port,
ctx=ctx,
path_out=path_out,
)