This commit is contained in:
Alexandra Ramassamy 2022-10-06 09:54:00 +02:00
commit 8c8e7c05b7
9 changed files with 91 additions and 30 deletions

View file

@ -18,4 +18,4 @@ jobs:
- uses: actions/checkout@v3
- name: typos-action
uses: crate-ci/typos@v1.12.7
uses: crate-ci/typos@v1.12.8

View file

@ -17,6 +17,7 @@ Enough machine power is needed to transcribe in real time.
pip install -U git+https://github.com/shirayu/whispering.git
# If you use GPU, install proper torch and torchaudio
# Check https://pytorch.org/get-started/locally/
# Example: torch for CUDA 11.6
pip install -U torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
```
@ -35,6 +36,7 @@ whispering --language en --model tiny
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures exhaust decoding time
- ``--debug`` outputs logs for debug
- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to analyze periods without voice activity as well
- ``--output`` sets output file (Default: Standard output)
### Parse interval

14
package-lock.json generated
View file

@ -9,7 +9,7 @@
"version": "1.0.0",
"devDependencies": {
"markdownlint-cli": "^0.32.1",
"pyright": "^1.1.270"
"pyright": "^1.1.273"
}
},
"node_modules/argparse": {
@ -256,9 +256,9 @@
}
},
"node_modules/pyright": {
"version": "1.1.272",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.272.tgz",
"integrity": "sha512-32AEfp7JwZ7aaSFoAObvw9CRNyctZT7UIs+4O2bBhAg1+2UaVRXUTbBeKqJD+hTuA9I+HYOLaCJWcGug70R7LA==",
"version": "1.1.273",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.273.tgz",
"integrity": "sha512-uhBqKtRnC1Rvgz7uKp13VEwIR/UuqUvlscOu/y6hQhDzpFrZi0Gft7TrSLIMdy7fRAf85dS1nduQmAIWXgl4AA==",
"dev": true,
"bin": {
"pyright": "index.js",
@ -502,9 +502,9 @@
}
},
"pyright": {
"version": "1.1.272",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.272.tgz",
"integrity": "sha512-32AEfp7JwZ7aaSFoAObvw9CRNyctZT7UIs+4O2bBhAg1+2UaVRXUTbBeKqJD+hTuA9I+HYOLaCJWcGug70R7LA==",
"version": "1.1.273",
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.273.tgz",
"integrity": "sha512-uhBqKtRnC1Rvgz7uKp13VEwIR/UuqUvlscOu/y6hQhDzpFrZi0Gft7TrSLIMdy7fRAf85dS1nduQmAIWXgl4AA==",
"dev": true
},
"run-con": {

View file

@ -11,6 +11,6 @@
"dependencies": {},
"devDependencies": {
"markdownlint-cli": "^0.32.1",
"pyright": "^1.1.270"
"pyright": "^1.1.273"
}
}

27
poetry.lock generated
View file

@ -291,6 +291,20 @@ python-versions = ">=3.6.8"
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "PySoundFile"
version = "0.9.0.post1"
description = "An audio library based on libsndfile, CFFI and NumPy"
category = "main"
optional = false
python-versions = "*"
[package.dependencies]
cffi = ">=0.6"
[package.extras]
numpy = ["numpy"]
[[package]]
name = "PyYAML"
version = "6.0"
@ -519,13 +533,13 @@ dev = ["pytest"]
[package.source]
type = "git"
url = "https://github.com/openai/whisper.git"
reference = '0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f'
resolved_reference = "0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f"
reference = '9e653bd0ea0f1e9493cb4939733e9de249493cfb'
resolved_reference = "9e653bd0ea0f1e9493cb4939733e9de249493cfb"
[metadata]
lock-version = "1.1"
python-versions = ">=3.8,<3.11"
content-hash = "ab527970383bc2245dee005627d0695812601115a36e15a5ef9e66d1185791bf"
content-hash = "e03ed06253a5fa7329768e3c8ebb0874055f4cfe06ac1bd5e5b79e06157ba37e"
[metadata.files]
black = [
@ -774,6 +788,13 @@ pyparsing = [
{file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
]
PySoundFile = [
{file = "PySoundFile-0.9.0.post1-py2.py3-none-any.whl", hash = "sha256:db14f84f4af1910f54766cf0c0f19d52414fa80aa0e11cb338b5614946f39947"},
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-macosx_10_5_x86_64.macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.whl", hash = "sha256:5889138553f4e675158054f8f41c212ca76ac0e2d949e38d1dd8ded4ca3f0ce0"},
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-win32.whl", hash = "sha256:c5c5cc8e5f3793a4b9f405c0c77e116e859ac16e065bb6b7f78f2a59484fd7a8"},
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-win_amd64.whl", hash = "sha256:d92afd505d395523200d5b7f217e409bae4639c90cc61e90832a57a5a0fb484a"},
{file = "PySoundFile-0.9.0.post1.tar.gz", hash = "sha256:43dd46a2afc0484c26930a7e59eef9365cee81bce7a4aadc5699f788f60d32c3"},
]
PyYAML = [
{file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
{file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},

View file

@ -1,5 +1,5 @@
[tool.poetry]
name = "whisper-streaming"
name = "whispering"
version = "0.1.0"
description = ""
authors = ["Yuta Hayashibe <yuta@hayashibe.jp>"]
@ -8,12 +8,13 @@ packages = [{include = "whispering"}]
[tool.poetry.dependencies]
python = ">=3.8,<3.11"
whisper = {git = "https://github.com/openai/whisper.git", rev = '0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f'}
whisper = {git = "https://github.com/openai/whisper.git", rev = '9e653bd0ea0f1e9493cb4939733e9de249493cfb'}
sounddevice = "^0.4.5"
pydantic = "^1.10.2"
websockets = "^10.3"
tqdm = "*"
torchaudio = "^0.12.1"
PySoundFile = {version = "^0.9.0.post1", platform = "win32"}
[tool.poetry.group.dev.dependencies]

View file

@ -6,7 +6,8 @@ import queue
import sys
from enum import Enum
from logging import DEBUG, INFO, basicConfig, getLogger
from typing import Optional, Union
from pathlib import Path
from typing import Iterator, Optional, Union
import sounddevice as sd
import torch
@ -15,7 +16,7 @@ from whisper.audio import N_FRAMES, SAMPLE_RATE
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
"""
from whispering.pbar import ProgressBar
from whispering.schema import Context, WhisperConfig
from whispering.schema import Context, StdoutWriter, WhisperConfig
from whispering.serve import serve_with_websocket
from whispering.transcriber import WhisperStreamingTranscriber
from whispering.websocket_client import run_websocket_client
@ -45,7 +46,7 @@ def transcribe_from_mic(
num_block: int,
ctx: Context,
no_progress: bool,
) -> None:
) -> Iterator[str]:
q = queue.Queue()
def sd_callback(indata, frames, time, status):
@ -88,7 +89,7 @@ def transcribe_from_mic(
if not no_progress:
sys.stderr.write("\r")
sys.stderr.flush()
print(f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}")
yield f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}\n"
if not no_progress:
sys.stderr.write("Analyzing")
sys.stderr.flush()
@ -156,6 +157,13 @@ def get_opts() -> argparse.Namespace:
)
group_misc = parser.add_argument_group("Other options")
group_misc.add_argument(
"--output",
"-o",
help="Output file",
type=Path,
default=StdoutWriter(),
)
group_misc.add_argument(
"--mic",
help="Set MIC device",
@ -275,6 +283,7 @@ def main() -> None:
port=opts.port,
no_progress=opts.no_progress,
ctx=ctx,
path_out=opts.output,
)
)
except KeyboardInterrupt:
@ -295,13 +304,16 @@ def main() -> None:
assert opts.model is not None
wsp = get_wshiper(opts=opts)
ctx: Context = get_context(opts=opts)
transcribe_from_mic(
wsp=wsp,
sd_device=opts.mic,
num_block=opts.num_block,
no_progress=opts.no_progress,
ctx=ctx,
)
with opts.output.open("w") as outf:
for text in transcribe_from_mic(
wsp=wsp,
sd_device=opts.mic,
num_block=opts.num_block,
no_progress=opts.no_progress,
ctx=ctx,
):
outf.write(text)
outf.flush()
if __name__ == "__main__":

View file

@ -1,5 +1,6 @@
#!/usr/bin/env python3
import sys
from typing import List, Optional
import numpy as np
@ -59,3 +60,20 @@ class SpeechSegment(BaseModel, arbitrary_types_allowed=True):
start_block_idx: int
end_block_idx: int
audio: np.ndarray
class StdoutWriter:
def open(self, *args, **kwargs):
return self
def __enter__(self, *args, **kwargs):
return self
def __exit__(self):
pass
def flush(self):
sys.stdout.flush()
def write(self, text):
sys.stdout.write(text)

View file

@ -3,14 +3,15 @@ import asyncio
import json
import sys
from logging import getLogger
from pathlib import Path
from typing import Optional, Union
import sounddevice as sd
import websockets
from whisper.audio import N_FRAMES, SAMPLE_RATE
from schema import ParsedChunk
from transcriber import Context
from whispering.schema import ParsedChunk, StdoutWriter
from whispering.transcriber import Context
logger = getLogger(__name__)
@ -28,6 +29,7 @@ async def transcribe_from_mic_and_send(
host: str,
port: int,
ctx: Context,
path_out: Union[Path, StdoutWriter],
) -> None:
uri = f"ws://{host}:{port}"
@ -38,7 +40,7 @@ async def transcribe_from_mic_and_send(
dtype="float32",
channels=1,
callback=sd_callback,
):
), path_out.open("w") as outf:
async with websockets.connect(uri, max_size=999999999) as ws: # type:ignore
logger.debug("Sent context")
v: str = ctx.json()
@ -69,10 +71,13 @@ async def transcribe_from_mic_and_send(
c = await asyncio.wait_for(recv(), timeout=0.5)
c_json = json.loads(c)
if (err := c_json.get("error")) is not None:
print(f"Error: {err}")
sys.stderr.write(f"Error: {err}\n")
sys.exit(1)
chunk = ParsedChunk.parse_obj(c_json)
print(f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}")
outf.write(
f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}\n"
)
outf.flush()
except asyncio.TimeoutError:
break
idx += 1
@ -86,6 +91,7 @@ async def run_websocket_client(
port: int,
ctx: Context,
no_progress: bool,
path_out: Union[Path, StdoutWriter],
) -> None:
global q
global loop
@ -98,4 +104,5 @@ async def run_websocket_client(
host=host,
port=port,
ctx=ctx,
path_out=path_out,
)