From 32211534158b04e33a7f8313219a20c0b016a40d Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sun, 2 Oct 2022 22:46:01 +0900 Subject: [PATCH] Fix the behavior without --allow-padding --- whispering/transcriber.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/whispering/transcriber.py b/whispering/transcriber.py index 7753413..878db24 100644 --- a/whispering/transcriber.py +++ b/whispering/transcriber.py @@ -260,13 +260,22 @@ class WhisperStreamingTranscriber: seek: int = 0 while seek < mel.shape[-1]: + logger.debug(f"seek: {seek}") + if mel.shape[-1] - seek < N_FRAMES: + logger.debug( + f"mel.shape ({mel.shape[-1]}) - seek ({seek}) < N_FRAMES ({N_FRAMES})" + ) + if ctx.allow_padding: + logger.warning("Padding is not expected while speaking") + else: + logger.debug("No padding") + break + segment: torch.Tensor = ( pad_or_trim(mel[:, seek:], N_FRAMES) .to(self.model.device) # type: ignore .to(self.dtype) ) - if not ctx.allow_padding and segment.shape[-1] > mel.shape[-1]: - logger.warning("Padding is not expected while speaking") logger.debug( f"seek={seek}, timestamp={ctx.timestamp}, " @@ -309,9 +318,6 @@ class WhisperStreamingTranscriber: seek += last_timestamp_position * self.input_stride logger.debug(f"new seek={seek}, mel.shape: {mel.shape}") - if (not ctx.allow_padding) and (mel.shape[-1] - seek < N_FRAMES): - break - if mel.shape[-1] - seek <= 0: logger.debug(f"ctx.buffer_mel is None ({mel.shape}, {seek})") return