2022-09-23 10:20:11 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import queue
|
|
|
|
from logging import INFO, getLogger
|
2022-09-23 10:28:11 +00:00
|
|
|
from typing import Optional, Union
|
2022-09-23 10:20:11 +00:00
|
|
|
|
|
|
|
import sounddevice as sd
|
|
|
|
import torch
|
|
|
|
from whisper import available_models
|
|
|
|
from whisper.audio import N_FRAMES, SAMPLE_RATE
|
|
|
|
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
|
|
|
|
|
|
|
|
from whisper_streaming.schema import WhisperConfig
|
|
|
|
from whisper_streaming.transcriber import WhisperStreamingTranscriber
|
|
|
|
|
|
|
|
# Module-level logger; level/handlers are configured at runtime in main().
logger = getLogger(__name__)
|
|
|
|
|
|
|
|
|
2022-09-23 10:28:11 +00:00
|
|
|
def transcribe_from_mic(
    *,
    config: WhisperConfig,
    sd_device: Optional[Union[int, str]],
) -> None:
    """Capture audio from a microphone and print streaming transcriptions.

    Blocks forever (until interrupted). Audio chunks are pushed from the
    sounddevice callback thread into a queue and consumed here on the
    main thread.

    Args:
        config: Configuration for the streaming Whisper transcriber.
        sd_device: sounddevice input device (numeric index or name);
            ``None`` selects the system default input device.
    """
    wsp = WhisperStreamingTranscriber(config=config)
    q: queue.Queue = queue.Queue()

    def sd_callback(indata, frames, time, status):
        # Runs on sounddevice's audio thread; keep the work minimal.
        if status:
            logger.warning(status)
        # PortAudio reuses the `indata` buffer after this callback returns,
        # and `ravel()` on a contiguous mono array is only a view — copy
        # before handing the data to the consumer thread, otherwise the
        # segment can be overwritten before it is transcribed.
        q.put(indata.ravel().copy())

    logger.info("Ready to transcribe")
    with sd.InputStream(
        samplerate=SAMPLE_RATE,
        blocksize=N_FRAMES * 10,  # FIXME
        device=sd_device,
        dtype="float32",
        channels=1,
        callback=sd_callback,
    ):
        while True:
            segment = q.get()
            for chunk in wsp.transcribe(segment=segment):
                print(f"{chunk.start}->{chunk.end}\t{chunk.text}")
|
2022-09-23 10:20:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
def get_opts() -> argparse.Namespace:
    """Parse command-line options for the mic-transcription CLI.

    Returns:
        Namespace with ``language``, ``model``, ``device``, ``beam_size``
        and ``mic`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--language",
        type=str,
        # `default=None` was redundant here: with required=True the default
        # is never used, so it has been dropped.
        choices=sorted(LANGUAGES.keys())
        + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]),
        required=True,
        help="language spoken in the audio",
    )
    parser.add_argument(
        "--model",
        type=str,
        choices=available_models(),
        required=True,
        help="name of the Whisper model to use",
    )
    parser.add_argument(
        "--device",
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="device to use for PyTorch inference",
    )
    parser.add_argument(
        "--beam_size",
        "-b",
        type=int,
        default=5,
        help="beam size for decoding (a value <= 0 disables beam search)",
    )
    parser.add_argument(
        "--mic",
        help="sounddevice input device index or name "
        "(default: system default input)",
    )
    return parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: parse options and start microphone transcription."""
    from logging import basicConfig

    opts = get_opts()
    # setLevel() alone is not enough: without a configured handler the
    # last-resort handler only emits WARNING and above, so INFO messages
    # such as "Ready to transcribe" would never be shown.
    basicConfig(level=INFO)
    logger.setLevel(INFO)
    if opts.beam_size <= 0:
        # A non-positive beam size means "disable beam search".
        opts.beam_size = None
    try:
        # --mic accepts either a numeric device index or a device name;
        # try the numeric interpretation first, fall back to the raw value.
        opts.mic = int(opts.mic)
    except (TypeError, ValueError):
        # TypeError: --mic was not given (None); ValueError: it is a name.
        pass
    config = WhisperConfig(
        model_name=opts.model,
        language=opts.language,
        device=opts.device,
        beam_size=opts.beam_size,
    )
    transcribe_from_mic(
        config=config,
        sd_device=opts.mic,
    )
|
2022-09-23 10:20:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|