from typing import Dict

from transformers.pipelines.audio_utils import ffmpeg_read
import torch
import whisper

SAMPLE_RATE = 16000

class EndpointHandler:
    def __init__(self, path=""):
        # load the model (`path` is the model repository path provided by the
        # endpoint; it is unused here because the Whisper checkpoint is fetched by name)
        self.model = whisper.load_model("medium")
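    # Illustrative note (not in the original handler): whisper.load_model also
    # accepts a `device` argument, so GPU inference could be enabled with e.g.
    #   self.model = whisper.load_model(
    #       "medium", device="cuda" if torch.cuda.is_available() else "cpu"
    #   )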
    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """
        Args:
            data (:obj:`dict`):
                includes the deserialized audio file as bytes under the "inputs" key
        Return:
            A :obj:`dict` with the transcription under the "text" key
        """
        # process input: decode the raw bytes into a 16 kHz mono waveform
        inputs = data.pop("inputs", data)
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        audio_tensor = torch.from_numpy(audio_nparray)

        # run inference on the decoded waveform (transcribe accepts a torch tensor)
        result = self.model.transcribe(audio_tensor)

        # postprocess the prediction
        return {"text": result["text"]}
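

# --- Minimal local smoke test (an illustrative sketch, not part of the handler) ---
# Assumes an audio file named "sample.flac" next to this script; the filename is
# hypothetical, and ffmpeg must be installed for ffmpeg_read to decode the bytes.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("sample.flac", "rb") as f:
        request = {"inputs": f.read()}
    prediction = handler(request)
    print(prediction)  # e.g. {"text": " Hello world."}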