from transformers import pipeline
import torch
import gradio as gr
import subprocess
import numpy as np

# ASR pipeline (Wav2Vec2 XLS-R) and the Silero VAD model (ONNX backend).
p = pipeline("automatic-speech-recognition", model="aware-ai/wav2vec2-xls-r-300m")
model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad",
                              model="silero_vad", force_reload=False, onnx=True)
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Helper function to decode an audio payload to a float32 waveform through ffmpeg.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "f32le"
    ffmpeg_command = [
        "ffmpeg",
        "-i", "pipe:0",               # read the encoded payload from stdin
        "-ac", ac,                    # downmix to mono
        "-ar", ar,                    # resample to the target rate
        "-f", format_for_conversion,  # raw 32-bit float little-endian PCM
        "-hide_banner",
        "-loglevel", "quiet",
        "pipe:1",                     # write decoded samples to stdout
    ]
    try:
        with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
            output_stream = ffmpeg_process.communicate(bpayload)
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
    out_bytes = output_stream[0]
    audio = np.frombuffer(out_bytes, np.float32)
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio
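
# A minimal sketch of exercising ffmpeg_read on its own (assumes ffmpeg is on
# the PATH and a local file at "sample.wav", a hypothetical path):
#
#     with open("sample.wav", "rb") as f:
#         wav = ffmpeg_read(f.read(), sampling_rate=16000)
#     print(wav.shape, wav.dtype)  # e.g. (160000,) float32
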
# torch.hub returns the Silero VAD helpers as a tuple; only
# get_speech_timestamps (and read_audio, unused below) are unpacked here.
(get_speech_timestamps,
 _, read_audio,
 *_) = utils


def is_speech(wav, sr):
    # get_speech_timestamps returns a list of {"start", "end"} sample offsets;
    # a non-empty list means the chunk contains detected speech.
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sr)
    return len(speech_timestamps) > 0
def transcribe(audio, state=None):
    # Gradio supplies None for the state on the first call of a session.
    if state is None:
        state = {"text": "", "temp_text": "", "audio": None}
    with open(audio, "rb") as f:
        payload = f.read()
    wav_data = ffmpeg_read(payload, sampling_rate=16000)
    _sr = 16000
    speech = is_speech(wav_data, _sr)
    if speech:
        # Keep buffering audio chunks while the speaker is talking.
        if state["audio"] is None:
            state["audio"] = wav_data
        else:
            state["audio"] = np.concatenate((state["audio"], wav_data))
    else:
        # First silent chunk after speech: transcribe the buffered utterance
        # and flush the buffer. (None is used as the sentinel instead of the
        # original `is ""` identity checks, which are unreliable and fail
        # once the buffer holds an ndarray.)
        if state["audio"] is not None:
            text = p(state["audio"])["text"] + "\n"
            state["temp_text"] = text
            state["text"] += state["temp_text"]
            state["temp_text"] = ""
            state["audio"] = None
    return f'{state["text"]} ( {state["temp_text"]} )', state
gr.Interface(
    transcribe,
    [gr.Audio(source="microphone", type="filepath", streaming=True), "state"],
    [gr.Textbox(), "state"],
    live=True,
).launch(server_name="0.0.0.0")
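
# To try the Space locally (a sketch; assumes the file is saved as app.py,
# the Hugging Face Spaces convention, with transformers, torch, gradio,
# numpy, and onnxruntime installed, and ffmpeg available on the PATH):
#
#     python app.py
#
# then open http://localhost:7860 (Gradio's default port), allow microphone
# access, and speak; buffered speech is transcribed after each pause.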