asr
Irpan committed · bef8623
Parent(s): ef107e3
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 import asr
+import tts
 # from tts import synthesize
 
 
@@ -24,31 +25,25 @@ mms_transcribe = gr.Interface(
     allow_flagging="never",
 )
 
-
-
-
-
-
-
-
-
-#
-
-
-
-
-# )
+mms_synthesize = gr.Interface(
+    fn=tts.synthesize,
+    inputs=[
+        gr.Text(label="Input text"),
+    ],
+    outputs=[
+        gr.Audio(label="Generated Audio", type="numpy"),
+    ],
+    #examples=TTS_EXAMPLES,
+    title="Text-to-speech",
+    description=("Generate audio from input text."),
+    allow_flagging="never",
+)
 
 tabbed_interface = gr.TabbedInterface(
-    [mms_transcribe],
-    ["Speech-to-text"],
+    [mms_transcribe, mms_synthesize],
+    ["Speech-to-text", "Text-to-speech"],
 )
 
-# tabbed_interface = gr.TabbedInterface(
-#     [mms_transcribe, mms_synthesize],
-#     ["Speech-to-text", "Text-to-speech"],
-# )
-
 with gr.Blocks() as demo:
     tabbed_interface.render()
 
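Note on the new interface: tts.synthesize (added below in tts.py) takes two arguments, text and model_id, but mms_synthesize wires only the text box into it, so the call as committed would fail with a missing-argument error. A minimal sketch of one way to supply the second argument; the gr.Dropdown and its "Meta-MMS" default are illustrative assumptions, not part of this commit:

import gradio as gr
import tts

# Hypothetical variant: expose the model choice as a dropdown whose value
# is forwarded to tts.synthesize as its model_id argument.
mms_synthesize = gr.Interface(
    fn=tts.synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(choices=list(tts.models_info.keys()), value="Meta-MMS", label="Model"),
    ],
    outputs=[gr.Audio(label="Generated Audio", type="numpy")],
    title="Text-to-speech",
    description="Generate audio from input text.",
    allow_flagging="never",
)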
asr.py CHANGED
@@ -12,7 +12,7 @@ import numpy as np
 
 # Load processor and model
 models_info = {
-    "
+    "OpenAI-Whisper-Uzbek": {
         "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
         "ctc_model": False
@@ -27,7 +27,7 @@ models_info = {
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
         "ctc_model": False
     },
-    "
+    "Meta-MMS": {
         "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
         "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
         "ctc_model": True
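For context on the renamed keys: each entry's ctc_model flag distinguishes the two decoding paths these checkpoints need. The commit does not show asr.py's inference code, so the transcribe function below is only an illustrative sketch of how such a flag is typically consumed with transformers, not the repo's actual implementation:

import torch

def transcribe(audio_array, model_id, sampling_rate=16000):
    processor = models_info[model_id]["processor"]
    model = models_info[model_id]["model"]
    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt")
    with torch.no_grad():
        if models_info[model_id]["ctc_model"]:
            # CTC checkpoint (facebook/mms-1b-all): greedy argmax over frame logits
            ids = torch.argmax(model(**inputs).logits, dim=-1)
            return processor.batch_decode(ids)[0]
        # Encoder-decoder checkpoint (Whisper): autoregressive generation
        ids = model.generate(inputs["input_features"])
        return processor.batch_decode(ids, skip_special_tokens=True)[0]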
tts.py ADDED
@@ -0,0 +1,23 @@
+from transformers import VitsModel, AutoTokenizer
+import torch
+
+# Load processor and model
+models_info = {
+    "Meta-MMS": {
+        "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
+        "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
+    },
+}
+
+def synthesize(text, model_id):
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    processor = models_info[model_id]["processor"]
+    model = models_info[model_id]["model"].to(device)
+    inputs = processor(text, return_tensors="pt").to(device)
+
+    with torch.no_grad():
+        output = model(**inputs).waveform
+
+    sampling_rate = 22050
+
+    return (output.cpu(), sampling_rate)
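Two details in synthesize are worth flagging. First, VITS checkpoints carry their native rate in model.config.sampling_rate, and the MMS-TTS checkpoints run at 16 kHz, so the hardcoded 22050 would make playback noticeably fast. Second, a gr.Audio(type="numpy") output expects a (sample_rate, numpy_array) tuple with the rate first, while the function returns a torch tensor first. A sketch of the return path under those assumptions:

    with torch.no_grad():
        waveform = model(**inputs).waveform  # shape: (batch, num_samples)

    # Use the checkpoint's own rate instead of a hardcoded constant, and
    # hand Gradio (rate, 1-D numpy array) in that order.
    return (model.config.sampling_rate, waveform[0].cpu().numpy())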