mirror of
https://github.com/shirayu/whispering.git
synced 2024-06-12 18:29:21 +00:00
Merge branch 'master' of https://github.com/AlexandraRamassamy/whispering
This commit is contained in:
commit
8c8e7c05b7
2
.github/workflows/typos.yml
vendored
2
.github/workflows/typos.yml
vendored
|
@ -18,4 +18,4 @@ jobs:
|
|||
- uses: actions/checkout@v3
|
||||
|
||||
- name: typos-action
|
||||
uses: crate-ci/typos@v1.12.7
|
||||
uses: crate-ci/typos@v1.12.8
|
||||
|
|
|
@ -17,6 +17,7 @@ Enough machine power is needed to transcribe in real time.
|
|||
pip install -U git+https://github.com/shirayu/whispering.git
|
||||
|
||||
# If you use GPU, install proper torch and torchaudio
|
||||
# Check https://pytorch.org/get-started/locally/
|
||||
# Example : torch for CUDA 11.6
|
||||
pip install -U torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
|
||||
```
|
||||
|
@ -35,6 +36,7 @@ whispering --language en --model tiny
|
|||
- ``-t`` sets temperatures to decode. You can set several like ``-t 0.0 -t 0.1 -t 0.5``, but too many temperatures increase decoding time
|
||||
- ``--debug`` outputs debug logs
|
||||
- ``--no-vad`` disables VAD (Voice Activity Detection). This forces whisper to also analyze periods of sound without voice activity
|
||||
- ``--output`` sets output file (Default: Standard output)
|
||||
|
||||
### Parse interval
|
||||
|
||||
|
|
14
package-lock.json
generated
14
package-lock.json
generated
|
@ -9,7 +9,7 @@
|
|||
"version": "1.0.0",
|
||||
"devDependencies": {
|
||||
"markdownlint-cli": "^0.32.1",
|
||||
"pyright": "^1.1.270"
|
||||
"pyright": "^1.1.273"
|
||||
}
|
||||
},
|
||||
"node_modules/argparse": {
|
||||
|
@ -256,9 +256,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/pyright": {
|
||||
"version": "1.1.272",
|
||||
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.272.tgz",
|
||||
"integrity": "sha512-32AEfp7JwZ7aaSFoAObvw9CRNyctZT7UIs+4O2bBhAg1+2UaVRXUTbBeKqJD+hTuA9I+HYOLaCJWcGug70R7LA==",
|
||||
"version": "1.1.273",
|
||||
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.273.tgz",
|
||||
"integrity": "sha512-uhBqKtRnC1Rvgz7uKp13VEwIR/UuqUvlscOu/y6hQhDzpFrZi0Gft7TrSLIMdy7fRAf85dS1nduQmAIWXgl4AA==",
|
||||
"dev": true,
|
||||
"bin": {
|
||||
"pyright": "index.js",
|
||||
|
@ -502,9 +502,9 @@
|
|||
}
|
||||
},
|
||||
"pyright": {
|
||||
"version": "1.1.272",
|
||||
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.272.tgz",
|
||||
"integrity": "sha512-32AEfp7JwZ7aaSFoAObvw9CRNyctZT7UIs+4O2bBhAg1+2UaVRXUTbBeKqJD+hTuA9I+HYOLaCJWcGug70R7LA==",
|
||||
"version": "1.1.273",
|
||||
"resolved": "https://registry.npmjs.org/pyright/-/pyright-1.1.273.tgz",
|
||||
"integrity": "sha512-uhBqKtRnC1Rvgz7uKp13VEwIR/UuqUvlscOu/y6hQhDzpFrZi0Gft7TrSLIMdy7fRAf85dS1nduQmAIWXgl4AA==",
|
||||
"dev": true
|
||||
},
|
||||
"run-con": {
|
||||
|
|
|
@ -11,6 +11,6 @@
|
|||
"dependencies": {},
|
||||
"devDependencies": {
|
||||
"markdownlint-cli": "^0.32.1",
|
||||
"pyright": "^1.1.270"
|
||||
"pyright": "^1.1.273"
|
||||
}
|
||||
}
|
||||
|
|
27
poetry.lock
generated
27
poetry.lock
generated
|
@ -291,6 +291,20 @@ python-versions = ">=3.6.8"
|
|||
[package.extras]
|
||||
diagrams = ["jinja2", "railroad-diagrams"]
|
||||
|
||||
[[package]]
|
||||
name = "PySoundFile"
|
||||
version = "0.9.0.post1"
|
||||
description = "An audio library based on libsndfile, CFFI and NumPy"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[package.dependencies]
|
||||
cffi = ">=0.6"
|
||||
|
||||
[package.extras]
|
||||
numpy = ["numpy"]
|
||||
|
||||
[[package]]
|
||||
name = "PyYAML"
|
||||
version = "6.0"
|
||||
|
@ -519,13 +533,13 @@ dev = ["pytest"]
|
|||
[package.source]
|
||||
type = "git"
|
||||
url = "https://github.com/openai/whisper.git"
|
||||
reference = '0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f'
|
||||
resolved_reference = "0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f"
|
||||
reference = '9e653bd0ea0f1e9493cb4939733e9de249493cfb'
|
||||
resolved_reference = "9e653bd0ea0f1e9493cb4939733e9de249493cfb"
|
||||
|
||||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = ">=3.8,<3.11"
|
||||
content-hash = "ab527970383bc2245dee005627d0695812601115a36e15a5ef9e66d1185791bf"
|
||||
content-hash = "e03ed06253a5fa7329768e3c8ebb0874055f4cfe06ac1bd5e5b79e06157ba37e"
|
||||
|
||||
[metadata.files]
|
||||
black = [
|
||||
|
@ -774,6 +788,13 @@ pyparsing = [
|
|||
{file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
|
||||
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
|
||||
]
|
||||
PySoundFile = [
|
||||
{file = "PySoundFile-0.9.0.post1-py2.py3-none-any.whl", hash = "sha256:db14f84f4af1910f54766cf0c0f19d52414fa80aa0e11cb338b5614946f39947"},
|
||||
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-macosx_10_5_x86_64.macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.whl", hash = "sha256:5889138553f4e675158054f8f41c212ca76ac0e2d949e38d1dd8ded4ca3f0ce0"},
|
||||
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-win32.whl", hash = "sha256:c5c5cc8e5f3793a4b9f405c0c77e116e859ac16e065bb6b7f78f2a59484fd7a8"},
|
||||
{file = "PySoundFile-0.9.0.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-win_amd64.whl", hash = "sha256:d92afd505d395523200d5b7f217e409bae4639c90cc61e90832a57a5a0fb484a"},
|
||||
{file = "PySoundFile-0.9.0.post1.tar.gz", hash = "sha256:43dd46a2afc0484c26930a7e59eef9365cee81bce7a4aadc5699f788f60d32c3"},
|
||||
]
|
||||
PyYAML = [
|
||||
{file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
|
||||
{file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
[tool.poetry]
|
||||
name = "whisper-streaming"
|
||||
name = "whispering"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
authors = ["Yuta Hayashibe <yuta@hayashibe.jp>"]
|
||||
|
@ -8,12 +8,13 @@ packages = [{include = "whispering"}]
|
|||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.8,<3.11"
|
||||
whisper = {git = "https://github.com/openai/whisper.git", rev = '0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f'}
|
||||
whisper = {git = "https://github.com/openai/whisper.git", rev = '9e653bd0ea0f1e9493cb4939733e9de249493cfb'}
|
||||
sounddevice = "^0.4.5"
|
||||
pydantic = "^1.10.2"
|
||||
websockets = "^10.3"
|
||||
tqdm = "*"
|
||||
torchaudio = "^0.12.1"
|
||||
PySoundFile = {version = "^0.9.0.post1", platform = "windows"}
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
|
|
@ -6,7 +6,8 @@ import queue
|
|||
import sys
|
||||
from enum import Enum
|
||||
from logging import DEBUG, INFO, basicConfig, getLogger
|
||||
from typing import Optional, Union
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Optional, Union
|
||||
|
||||
import sounddevice as sd
|
||||
import torch
|
||||
|
@ -15,7 +16,7 @@ from whisper.audio import N_FRAMES, SAMPLE_RATE
|
|||
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
|
||||
"""
|
||||
from whispering.pbar import ProgressBar
|
||||
from whispering.schema import Context, WhisperConfig
|
||||
from whispering.schema import Context, StdoutWriter, WhisperConfig
|
||||
from whispering.serve import serve_with_websocket
|
||||
from whispering.transcriber import WhisperStreamingTranscriber
|
||||
from whispering.websocket_client import run_websocket_client
|
||||
|
@ -45,7 +46,7 @@ def transcribe_from_mic(
|
|||
num_block: int,
|
||||
ctx: Context,
|
||||
no_progress: bool,
|
||||
) -> None:
|
||||
) -> Iterator[str]:
|
||||
q = queue.Queue()
|
||||
|
||||
def sd_callback(indata, frames, time, status):
|
||||
|
@ -88,7 +89,7 @@ def transcribe_from_mic(
|
|||
if not no_progress:
|
||||
sys.stderr.write("\r")
|
||||
sys.stderr.flush()
|
||||
print(f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}")
|
||||
yield f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}\n"
|
||||
if not no_progress:
|
||||
sys.stderr.write("Analyzing")
|
||||
sys.stderr.flush()
|
||||
|
@ -156,6 +157,13 @@ def get_opts() -> argparse.Namespace:
|
|||
)
|
||||
|
||||
group_misc = parser.add_argument_group("Other options")
|
||||
group_misc.add_argument(
|
||||
"--output",
|
||||
"-o",
|
||||
help="Output file",
|
||||
type=Path,
|
||||
default=StdoutWriter(),
|
||||
)
|
||||
group_misc.add_argument(
|
||||
"--mic",
|
||||
help="Set MIC device",
|
||||
|
@ -275,6 +283,7 @@ def main() -> None:
|
|||
port=opts.port,
|
||||
no_progress=opts.no_progress,
|
||||
ctx=ctx,
|
||||
path_out=opts.output,
|
||||
)
|
||||
)
|
||||
except KeyboardInterrupt:
|
||||
|
@ -295,13 +304,16 @@ def main() -> None:
|
|||
assert opts.model is not None
|
||||
wsp = get_wshiper(opts=opts)
|
||||
ctx: Context = get_context(opts=opts)
|
||||
transcribe_from_mic(
|
||||
wsp=wsp,
|
||||
sd_device=opts.mic,
|
||||
num_block=opts.num_block,
|
||||
no_progress=opts.no_progress,
|
||||
ctx=ctx,
|
||||
)
|
||||
with opts.output.open("w") as outf:
|
||||
for text in transcribe_from_mic(
|
||||
wsp=wsp,
|
||||
sd_device=opts.mic,
|
||||
num_block=opts.num_block,
|
||||
no_progress=opts.no_progress,
|
||||
ctx=ctx,
|
||||
):
|
||||
outf.write(text)
|
||||
outf.flush()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
@ -59,3 +60,20 @@ class SpeechSegment(BaseModel, arbitrary_types_allowed=True):
|
|||
start_block_idx: int
|
||||
end_block_idx: int
|
||||
audio: np.ndarray
|
||||
|
||||
|
||||
class StdoutWriter:
    """Minimal file-like stand-in that routes output to ``sys.stdout``.

    Callers treat the output target uniformly as something with
    ``.open("w")`` that yields a context manager exposing ``write`` and
    ``flush``. This class provides that surface for standard output so the
    same ``with target.open("w") as outf:`` code path can be used whether
    the target is a real path or stdout.
    """

    def open(self, *args, **kwargs):
        """Return ``self``; any mode/encoding arguments are accepted and ignored."""
        return self

    def __enter__(self, *args, **kwargs):
        """Enter the context and return the writer itself."""
        return self

    def __exit__(self, *args, **kwargs):
        """Leave the context without closing ``sys.stdout``.

        The ``with`` statement always calls ``__exit__`` with
        ``(exc_type, exc_value, traceback)``, so these must be accepted;
        a zero-argument ``__exit__`` raises ``TypeError`` on block exit.
        Returning ``None`` lets any in-flight exception propagate.
        """
        return None

    def flush(self):
        """Flush standard output."""
        sys.stdout.flush()

    def write(self, text):
        """Write ``text`` to standard output."""
        sys.stdout.write(text)
|
||||
|
|
|
@ -3,14 +3,15 @@ import asyncio
|
|||
import json
|
||||
import sys
|
||||
from logging import getLogger
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
import sounddevice as sd
|
||||
import websockets
|
||||
from whisper.audio import N_FRAMES, SAMPLE_RATE
|
||||
|
||||
from schema import ParsedChunk
|
||||
from transcriber import Context
|
||||
from whispering.schema import ParsedChunk, StdoutWriter
|
||||
from whispering.transcriber import Context
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
@ -28,6 +29,7 @@ async def transcribe_from_mic_and_send(
|
|||
host: str,
|
||||
port: int,
|
||||
ctx: Context,
|
||||
path_out: Union[Path, StdoutWriter],
|
||||
) -> None:
|
||||
uri = f"ws://{host}:{port}"
|
||||
|
||||
|
@ -38,7 +40,7 @@ async def transcribe_from_mic_and_send(
|
|||
dtype="float32",
|
||||
channels=1,
|
||||
callback=sd_callback,
|
||||
):
|
||||
), path_out.open("w") as outf:
|
||||
async with websockets.connect(uri, max_size=999999999) as ws: # type:ignore
|
||||
logger.debug("Sent context")
|
||||
v: str = ctx.json()
|
||||
|
@ -69,10 +71,13 @@ async def transcribe_from_mic_and_send(
|
|||
c = await asyncio.wait_for(recv(), timeout=0.5)
|
||||
c_json = json.loads(c)
|
||||
if (err := c_json.get("error")) is not None:
|
||||
print(f"Error: {err}")
|
||||
sys.stderr.write(f"Error: {err}\n")
|
||||
sys.exit(1)
|
||||
chunk = ParsedChunk.parse_obj(c_json)
|
||||
print(f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}")
|
||||
outf.write(
|
||||
f"{chunk.start:.2f}->{chunk.end:.2f}\t{chunk.text}\n"
|
||||
)
|
||||
outf.flush()
|
||||
except asyncio.TimeoutError:
|
||||
break
|
||||
idx += 1
|
||||
|
@ -86,6 +91,7 @@ async def run_websocket_client(
|
|||
port: int,
|
||||
ctx: Context,
|
||||
no_progress: bool,
|
||||
path_out: Union[Path, StdoutWriter],
|
||||
) -> None:
|
||||
global q
|
||||
global loop
|
||||
|
@ -98,4 +104,5 @@ async def run_websocket_client(
|
|||
host=host,
|
||||
port=port,
|
||||
ctx=ctx,
|
||||
path_out=path_out,
|
||||
)
|
||||
|
|
Loading…
Reference in a new issue