import gradio as gr
import torch
import soundfile as sf
import numpy as np
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# Load the fine-tuned Vietnamese SpeechT5 model, its processor, and the HiFi-GAN vocoder.
processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vectors from CMU ARCTIC, used to condition the voice of the synthesized speech.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

def generate_speech(text, voice):
    """Synthesize Vietnamese speech for `text` with the selected voice and return the WAV path."""
    # Map the UI choice to an index into the CMU ARCTIC x-vector dataset.
    speaker_dict = {"male": 2000, "female": 7000}
    speaker_id = speaker_dict[voice.lower()]
    speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)

    # Tokenize the input text.
    inputs = processor(text=text, return_tensors="pt")

    # Generate the waveform; the vocoder converts the predicted spectrogram to audio.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embedding,
            vocoder=vocoder,
            attention_mask=inputs.get("attention_mask"),
        )

    # SpeechT5 produces 16 kHz audio.
    output_path = "output_speech.wav"
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path


# Gradio UI: text box and voice selector in, generated audio out.
iface = gr.Interface(
    fn=generate_speech,
    inputs=[
        gr.Textbox(label="Enter Vietnamese Text", placeholder="Type your text here..."),
        gr.Radio(choices=["Male", "Female"], label="Select Voice", value="Male"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Vietnamese Text-to-Speech with SpeechT5",
    description="Enter Vietnamese text and select a voice (Male or Female) to generate speech.",
)


if __name__ == "__main__":
    iface.launch(debug=True)
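
# A minimal smoke test without the Gradio UI, assuming the checkpoints above load
# successfully; the sample sentence is only an illustrative placeholder:
#
#     wav_path = generate_speech("Xin chào, rất vui được gặp bạn.", "Female")
#     print("Speech saved to", wav_path)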