# Page header captured together with the file (not part of the program):
# danhtran2mind - "Update app.py" - commit abea497 (verified) - 1.96 kB
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Checkpoint identifier shared by the text processor and the TTS model.
_TTS_CHECKPOINT = "danhtran2mind/Viet-SpeechT5-TTS-finetuning"

# The processor and the text-to-speech model are loaded from the same checkpoint.
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)

# The vocoder is loaded from its own, separate checkpoint.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vector table; generate_speech picks one row from it per request.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
def generate_speech(text: str, voice: str) -> str:
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Args:
        text: Text to synthesize. Must contain at least one non-whitespace
            character.
        voice: ``"male"`` or ``"female"`` (case-insensitive). Selects which
            row of ``embeddings_dataset`` supplies the speaker x-vector.

    Returns:
        Path of a newly created WAV file (written at 16000 Hz) that holds
        the generated audio.

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace.
        KeyError: If ``voice`` is not one of the supported voice names.
    """
    # Reject blank input up front, before any model work is done.
    if not text or not text.strip():
        raise ValueError("Please enter some text to synthesize.")

    # Row indices into embeddings_dataset for each selectable voice.
    speaker_dict = {"male": 2000, "female": 7000}
    speaker_id = speaker_dict[voice.lower()]
    # unsqueeze(0) adds the leading batch dimension.
    speaker_embedding = torch.tensor(
        embeddings_dataset[speaker_id]["xvector"]
    ).unsqueeze(0)

    # Process the input text
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech; no gradients are needed for inference.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embedding,
            vocoder=vocoder,
            attention_mask=inputs.get("attention_mask"),
        )

    # Write every result to its own file. A single fixed file name would let
    # overlapping requests overwrite each other's audio. The handle is closed
    # before soundfile reopens the path; delete=False keeps the file on disk
    # so the caller can read it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path
# Input widgets, listed in the order generate_speech takes its arguments.
text_box = gr.Textbox(
    label="Enter Vietnamese Text",
    placeholder="Type your text here...",
)
voice_picker = gr.Radio(
    choices=["Male", "Female"],
    label="Select Voice",
    value="Male",
)

# Wire the widgets and the audio output around generate_speech.
iface = gr.Interface(
    fn=generate_speech,
    inputs=[text_box, voice_picker],
    outputs=gr.Audio(label="Generated Speech"),
    title="Vietnamese Text-to-Speech with SpeechT5",
    description="Enter Vietnamese text and select a voice (Male or Female) to generate speech.",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch(debug=True)