whispering/whisper_streaming/cli.py
2022-09-23 19:20:29 +09:00

92 lines
2.2 KiB
Python

#!/usr/bin/env python3
import argparse
import queue
from logging import INFO, getLogger
import sounddevice as sd
import torch
from whisper import available_models
from whisper.audio import N_FRAMES, SAMPLE_RATE
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
from whisper_streaming.schema import WhisperConfig
from whisper_streaming.transcriber import WhisperStreamingTranscriber
# Module-level logger named after this module; main() raises its level to INFO.
logger = getLogger(__name__)
def transcribe_from_mic(config: WhisperConfig) -> None:
    """Capture microphone audio and print transcriptions as they are produced.

    Opens a mono float32 ``sounddevice.InputStream`` at ``SAMPLE_RATE``, hands
    every captured block to a ``WhisperStreamingTranscriber`` and prints the
    ``text`` attribute of each non-``None`` result.  The loop never terminates
    on its own; it runs until the process is interrupted or an exception
    propagates.

    Args:
        config: Settings used to construct the ``WhisperStreamingTranscriber``.
    """
    # Passed straight through as InputStream's ``device`` argument.
    # NOTE(review): None is expected to select the default input device -
    # confirm against the sounddevice documentation.
    sd_device = None
    wsp = WhisperStreamingTranscriber(config=config)
    # Hands audio blocks from the sounddevice callback to the loop below.
    q = queue.Queue()

    def sd_callback(indata, frames, time, status):
        """Enqueue one captured audio block as a 1-D array."""
        if status:
            logger.warning(status)
        # ``indata`` wraps a buffer that is only valid while the callback is
        # running, and ``ravel()`` would return a view onto that same memory.
        # ``flatten()`` always copies, so the consumer sees stable data.
        q.put(indata.flatten())

    logger.info("Ready to transcribe")
    with sd.InputStream(
        samplerate=SAMPLE_RATE,
        blocksize=N_FRAMES * 10,  # FIXME
        device=sd_device,
        dtype="float32",
        channels=1,
        callback=sd_callback,
    ):
        while True:
            # Blocks until the callback has delivered the next audio block.
            segment = q.get()
            r = wsp.transcribe(segment=segment)
            # transcribe() may return None; only results with text are printed.
            if r is not None:
                print(r.text)
def get_opts() -> argparse.Namespace:
    """Parse the command-line arguments of the microphone transcriber.

    Options:
        --language        required; one of the keys of ``LANGUAGES`` or a
                          title-cased key of ``TO_LANGUAGE_CODE``.
        --model           required; one of ``available_models()``.
        --device          PyTorch device, "cuda" when CUDA is available and
                          "cpu" otherwise.
        --beam_size / -b  integer, defaults to 5.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    # Accepted language values: the LANGUAGES keys first, then the
    # title-cased TO_LANGUAGE_CODE keys, each group sorted on its own.
    language_codes = sorted(LANGUAGES.keys())
    language_names = sorted(name.title() for name in TO_LANGUAGE_CODE.keys())
    model_names = available_models()
    default_device = "cuda" if torch.cuda.is_available() else "cpu"

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--language",
        type=str,
        choices=language_codes + language_names,
        required=True,
    )
    cli.add_argument("--model", type=str, choices=model_names, required=True)
    cli.add_argument(
        "--device",
        default=default_device,
        help="device to use for PyTorch inference",
    )
    cli.add_argument("--beam_size", "-b", type=int, default=5)
    return cli.parse_args()
def main() -> None:
    """Entry point: parse options, build the config and start transcribing.

    A ``--beam_size`` of zero or less is forwarded to ``WhisperConfig`` as
    ``None``.
    """
    from logging import basicConfig

    opts = get_opts()

    # Without any configured handler, logging falls back to its last-resort
    # handler, which only emits WARNING and above, so this module's INFO
    # messages would be dropped.  basicConfig() attaches a stderr handler to
    # the root logger; the root level is left alone, so only this logger is
    # opened up to INFO.
    basicConfig()
    logger.setLevel(INFO)

    # NOTE(review): None presumably disables beam search - confirm against
    # WhisperConfig.
    if opts.beam_size <= 0:
        opts.beam_size = None

    config = WhisperConfig(
        model_name=opts.model,
        language=opts.language,
        device=opts.device,
        beam_size=opts.beam_size,
    )
    transcribe_from_mic(config=config)


# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()