Irpan committed · Commit 1dfec92 · Parent(s): 8377a77 · asr
app.py
CHANGED
@@ -13,13 +13,16 @@ mms_transcribe = gr.Interface(
             label="Select Model for ASR",
             value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
             interactive=True
-        )
+        ),
+    ],
+    outputs=[
+        gr.Textbox(label="Uyghur Arabic Transcription"),
+        gr.Textbox(label="Uyghur Latin Transcription"),
     ],
-    outputs="text",
     #examples=ASR_EXAMPLES,
     title="Speech-to-text",
     description=(
-        "Transcribe audio from a microphone or input file."
+        "Transcribe Uyghur speech audio from a microphone or input file."
     ),
     #article=ASR_NOTE,
     allow_flagging="never",
@@ -29,7 +32,7 @@ mms_synthesize = gr.Interface(
     fn=tts.synthesize,
     inputs=[
         gr.Text(label="Input text"),
-
+        gr.Dropdown(
             choices=[model for model in tts.models_info],
             label="Select Model for TTS",
             value="Meta-MMS",
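Context for the outputs change: when a gr.Interface is given a list of output components, Gradio expects the wrapped function to return one value per component. A minimal sketch of the pattern, with a hypothetical stub in place of the Space's real asr.transcribe (the gr.Audio input is an assumption, since the audio component sits outside this hunk):

import gradio as gr

# Hypothetical stand-in for the Space's real asr.transcribe: with two
# Textbox outputs, Gradio expects the function to return a 2-tuple,
# one value per output component.
def transcribe_stub(audio_path, model_id):
    return "<< Arabic-script text >>", "<< Latin-script text >>"

demo = gr.Interface(
    fn=transcribe_stub,
    inputs=[
        gr.Audio(type="filepath", label="Audio"),  # assumed; not shown in this hunk
        gr.Dropdown(choices=["Meta-MMS"], label="Select Model for ASR"),
    ],
    outputs=[
        gr.Textbox(label="Uyghur Arabic Transcription"),
        gr.Textbox(label="Uyghur Latin Transcription"),
    ],
    allow_flagging="never",
)

# demo.launch()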
asr.py
CHANGED
@@ -9,49 +9,55 @@ from transformers import (
     Wav2Vec2ForCTC
 )
 import numpy as np
+import util
 
 # Load processor and model
 models_info = {
     "OpenAI-Whisper-Uzbek": {
         "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "ixxan/whisper-small-thugy20": {
         "processor": AutoProcessor.from_pretrained("ixxan/whisper-small-thugy20"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-thugy20"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "ixxan/whisper-small-uyghur-common-voice": {
         "processor": AutoProcessor.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "Meta-MMS": {
         "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
         "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
-        "ctc_model": True
+        "ctc_model": True,
+        "arabic_script": True
     },
     "ixxan/wav2vec2-large-mms-1b-uyghur-latin": {
         "processor": Wav2Vec2Processor.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin", target_lang='uig-script_latin'),
         "model": Wav2Vec2ForCTC.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin", target_lang='uig-script_latin'),
-        "ctc_model": True
+        "ctc_model": True,
+        "arabic_script": False
     },
 }
 
-def transcribe(audio_data, model_id) -> str:
-    if model_id == "Compare All Models":
-        return transcribe_all_models(audio_data)
-    else:
-        return transcribe_with_model(audio_data, model_id)
+# def transcribe(audio_data, model_id) -> str:
+#     if model_id == "Compare All Models":
+#         return transcribe_all_models(audio_data)
+#     else:
+#         return transcribe_with_model(audio_data, model_id)
 
-def transcribe_all_models(audio_data) -> dict:
-    transcriptions = {}
-    for model_id in models_info.keys():
-        transcriptions[model_id] = transcribe_with_model(audio_data, model_id)
-    return transcriptions
+# def transcribe_all_models(audio_data) -> dict:
+#     transcriptions = {}
+#     for model_id in models_info.keys():
+#         transcriptions[model_id] = transcribe_with_model(audio_data, model_id)
+#     return transcriptions
 
-def transcribe_with_model(audio_data, model_id) -> str:
+def transcribe(audio_data, model_id) -> str:
     # Load audio file
     if not audio_data:
         return "<<ERROR: Empty Audio Input>>"
@@ -97,4 +103,10 @@ def transcribe_with_model(audio_data, model_id) -> str:
     generated_ids = model.generate(inputs["input_features"], max_length=225)
     transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-    return transcription
+    if models_info[model_id]["arabic_script"]:
+        transcription_arabic = transcription
+        transcription_latin = util.ug_arab_to_latn(transcription)
+    else: # Latin script output
+        transcription_arabic = util.ug_latn_to_arab(transcription)
+        transcription_latin = transcription
+    return transcription_arabic, transcription_latin
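The shape of the change: transcribe now always returns an (Arabic, Latin) pair, converting whichever script the model emits natively into the other one. A minimal standalone sketch of that routing, assuming only the two callable converters this commit adds in util.py:

import util

def to_both_scripts(transcription, arabic_script):
    # Models emit one script natively; derive the other so the interface
    # can always fill both output textboxes.
    if arabic_script:
        return transcription, util.ug_arab_to_latn(transcription)
    return util.ug_latn_to_arab(transcription), transcription

The retained ctc_model flag implies two decode paths above this hunk; only the seq2seq branch (old lines 97-98) is visible in the diff, so the CTC branch below is an assumption with hypothetical names:

import torch

def decode(inputs, model, processor, ctc_model):
    with torch.no_grad():
        if ctc_model:
            # wav2vec2/MMS CTC heads: greedy argmax over per-frame logits
            logits = model(inputs["input_values"]).logits
            ids = torch.argmax(logits, dim=-1)
            return processor.batch_decode(ids)[0]
        # Whisper-style seq2seq: autoregressive generation (as in the diff)
        ids = model.generate(inputs["input_features"], max_length=225)
        return processor.batch_decode(ids, skip_special_tokens=True)[0]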
tts.py
CHANGED
@@ -1,17 +1,21 @@
 from transformers import VitsModel, AutoTokenizer
 import torch
 import scipy.io.wavfile
+import util
 
 # Load processor and model
 models_info = {
     "Meta-MMS": {
         "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
         "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
+        "arabic_script": True
     },
 }
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 def synthesize(text, model_id):
+    if models_info[model_id]["arabic_script"]:
+        text = util.ug_latn_to_arab(text)
     processor = models_info[model_id]["processor"]
     model = models_info[model_id]["model"].to(device)
     inputs = processor(text, return_tensors="pt").to(device)
@@ -20,7 +24,7 @@ def synthesize(text, model_id):
     output = model(**inputs).waveform.cpu() # Move output back to CPU for saving
 
     output_path = "tts_output.wav"
-    sample_rate =
+    sample_rate = model.config.sample_rate
     scipy.io.wavfile.write(output_path, rate=sample_rate, data=output.numpy()[0])
 
     return output_path
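One caution on the new sample_rate line: in the transformers releases I know, the VITS config exposes the rate as sampling_rate (16 kHz for the MMS TTS checkpoints), so model.config.sample_rate may raise AttributeError at runtime. A defensive sketch of the save step under that assumption:

import scipy.io.wavfile

def save_waveform(model, waveform, path="tts_output.wav"):
    # VitsConfig exposes sampling_rate in current transformers; falling back
    # to the 16 kHz MMS default if neither spelling exists is an assumption.
    rate = getattr(model.config, "sampling_rate", None) \
        or getattr(model.config, "sample_rate", 16000)
    scipy.io.wavfile.write(path, rate=rate, data=waveform.numpy()[0])
    return path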
util.py
ADDED
@@ -0,0 +1,5 @@
+from umsc import UgMultiScriptConverter
+
+# Initialize uyghur script converter
+ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
+ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
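Usage sketch for the new converters. That the instances are callable is grounded in the commit itself (asr.py and tts.py call them directly); 'UAS' and 'ULS' are umsc's codes for Uyghur Arabic and Uyghur Latin script, and the sample strings below are illustrative assumptions:

from umsc import UgMultiScriptConverter

ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')

# Converter instances are callable, as asr.py and tts.py rely on:
print(ug_arab_to_latn("سالام"))  # illustrative: expected Latin-script "salam"
print(ug_latn_to_arab("salam"))  # illustrative: expected Arabic-script form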
|