import gradio as gr
import torch
import soundfile as sf
import numpy as np
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# Load the fine-tuned Vietnamese SpeechT5 model, its processor, and the HiFi-GAN vocoder.
processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vectors from CMU ARCTIC, used to condition the voice of the synthesized speech.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

def generate_speech(text, voice):
    """Synthesize Vietnamese speech for `text` with the selected voice and return the WAV path."""
    # Map the UI choice to an index into the CMU ARCTIC x-vector dataset.
    speaker_dict = {"male": 2000, "female": 7000}
    speaker_id = speaker_dict[voice.lower()]
    speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)

    # Tokenize the input text.
    inputs = processor(text=text, return_tensors="pt")

    # Generate the waveform; the vocoder converts the predicted spectrogram to audio.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embedding,
            vocoder=vocoder,
            attention_mask=inputs.get("attention_mask"),
        )

    # SpeechT5 produces 16 kHz audio.
    output_path = "output_speech.wav"
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path


# Gradio UI: text box and voice selector in, generated audio out.
iface = gr.Interface(
    fn=generate_speech,
    inputs=[
        gr.Textbox(label="Enter Vietnamese Text", placeholder="Type your text here..."),
        gr.Radio(choices=["Male", "Female"], label="Select Voice", value="Male"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Vietnamese Text-to-Speech with SpeechT5",
    description="Enter Vietnamese text and select a voice (Male or Female) to generate speech.",
)


if __name__ == "__main__":
    iface.launch(debug=True)
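
# A minimal smoke test without the Gradio UI, assuming the checkpoints above load
# successfully; the sample sentence is only an illustrative placeholder:
#
#     wav_path = generate_speech("Xin chào, rất vui được gặp bạn.", "Female")
#     print("Speech saved to", wav_path)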