import gradio as gr
import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import json
import os

# Directory containing config files
CONFIG_DIR = "assets/Viet-SpeechT5-TTS-finetuning"

# Load all config.json files
def load_configs(directory):
    print(f"Searching for config files in: {directory}")
    if not os.path.exists(directory):
        print(f"Directory {directory} does not exist.")
        return []
    
    examples = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file == "config.json":
                file_path = os.path.join(root, file)
                print(f"Found config file: {file_path}")
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        config = json.load(f)
                        if "input_text" in config and "voice" in config and "output_audio_path" in config:
                            examples.append([config["input_text"], config["voice"], os.path.join(root, config["output_audio_path"])])
                        else:
                            print(f"Skipping {file_path}: Missing required fields")
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
    print(f"Total examples loaded: {len(examples)}")
    return examples
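
# For reference, load_configs expects each config.json to contain the three fields checked
# above; output_audio_path is resolved relative to the folder containing the config.json.
# A minimal illustrative example (the real files under assets/ may hold more fields, and
# the Vietnamese text here is made up):
#
#   {
#       "input_text": "Xin chào, đây là ví dụ.",
#       "voice": "Female",
#       "output_audio_path": "output_speech.wav"
#   }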

# Load processor, model, and vocoder
print("Loading processor, model, and vocoder...")
try:
    processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
    model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    print("Models loaded successfully.")
except Exception as e:
    print(f"Error loading models: {e}")
    raise
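
# Note: everything below runs on CPU by default. Moving the model and vocoder to a GPU is
# possible, but the speaker embedding and tokenized inputs would then need to be moved to
# the same device as well (not done in this script).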

# Load speaker embeddings
print("Loading speaker embeddings...")
try:
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    print("Speaker embeddings loaded successfully.")
except Exception as e:
    print(f"Error loading embeddings: {e}")
    raise

def generate_speech(text, voice, output_path="output_speech.wav"):
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    if not text or not voice:
        return None, "Please provide both text and voice selection."
    
    speaker_dict = {"male": 2000, "female": 7000}
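    # 2000 and 7000 index xvector entries in the CMU ARCTIC validation split loaded above;
    # they serve here as the male and female reference voices.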
    try:
        speaker_id = speaker_dict[voice.lower()]
        speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
        inputs = processor(text=text, return_tensors="pt")
        
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask")
            )
        
        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        print(f"Error generating speech: {str(e)}")
        return None, f"Error generating speech: {str(e)}"
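
# Example of calling generate_speech directly, outside the Gradio UI (a sketch; the
# Vietnamese text and output filename are only illustrative):
#
#   audio_path, err = generate_speech("Xin chào các bạn.", "Female", "demo_output.wav")
#   if err:
#       print(err)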

def load_existing_audio(text, voice, output_audio_path):
    print(f"Load: {output_audio_path}")
    if not output_audio_path:
        return text, voice, None, "Please select an existing audio file."
    return text, voice, output_audio_path, "Successfully Loaded Example Audio"

# Load examples
print("Loading examples...")
examples = load_configs(CONFIG_DIR)
if not examples:
    print("Warning: No examples loaded. Check CONFIG_DIR and config.json files.")

# Create Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Vietnamese Text-to-Speech")
    gr.Markdown("Generate speech from Vietnamese text using SpeechT5 or load existing audio from examples.")
    
    # Arrange components in a row
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Vietnamese Text", placeholder="Enter text here...", lines=5)
            voice_input = gr.Radio(choices=["Male", "Female"], label="Voice", value="Male")
            
        with gr.Column():
            output = gr.Audio(label="Generated Speech") 
            error_output = gr.Textbox(label="Error Message")
    
    # Button to generate speech
    gr.Button("Generate").click(
        fn=generate_speech,
        inputs=[text_input, voice_input],
        outputs=[output, error_output]
    )
    
    # Examples component
    gr.Examples(
        examples=examples,
        fn=load_existing_audio,
        inputs=[text_input, voice_input, output],
        outputs=[text_input, voice_input, output, error_output],
        run_on_click=True,  # actually run load_existing_audio when an example is clicked
        label="Examples (Loads existing audio)"
    )

# Launch app
if __name__ == "__main__":
    print("Launching Gradio interface...")
    try:
        iface.launch()
        print("Gradio interface launched. Open your browser to http://localhost:7860")
    except Exception as e:
        print(f"Error launching Gradio interface: {e}")