import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSpeechSeq2Seq
import soundfile as sf
import numpy as np
import tempfile


# Hugging Face Hub identifier of the TTS checkpoint to load.
MODEL_NAME = "hynt/F5-TTS-Vietnamese-100h"

# All inference is pinned to the CPU.
device = torch.device("cpu")

# Load the tokenizer and the model weights once, at import time.
# float32 is used because the target runtime is CPU-only.
# NOTE(review): AutoModelForSpeechSeq2Seq is being used to load a TTS
# checkpoint — confirm this repository actually ships a compatible
# transformers config/weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float32
).to(device)


def tts_generate(text):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    Args:
        text: The text to synthesize. ``None``, an empty string, or a
            whitespace-only string means there is nothing to synthesize.

    Returns:
        The path of a newly created temporary ``.wav`` file, or ``None``
        when there is no text to synthesize. The file is not removed by
        this function.
    """
    # Guard against None as well as blank strings, so .strip() is never
    # called on a missing value.
    if text is None or not text.strip():
        return None

    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Generate audio; gradients are not needed for inference.
    with torch.no_grad():
        audio = model.generate(**inputs)

    audio = audio.cpu().numpy().squeeze()

    # Reserve a unique temporary .wav path. NamedTemporaryFile creates the
    # file atomically (unlike the deprecated, race-prone tempfile.mktemp);
    # the handle is closed before soundfile reopens the path by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        tmp_path = tmp_file.name

    # NOTE(review): the 22050 Hz sample rate is hard-coded — confirm it
    # matches the sample rate the model actually produces.
    sf.write(tmp_path, audio, 22050)

    return tmp_path


# --- Gradio UI ---
# Single-page layout: a text box for the input, an audio player for the
# result, and a button that runs tts_generate on the entered text.
with gr.Blocks(title="TTS Vietnamese Free") as demo:
    gr.Markdown(
        "# 🇻🇳 Text-to-Speech Vietnamese (Free Version)\nNhập văn bản tiếng Việt và nhấn **Sinh giọng nói**:"
    )

    text_in = gr.Textbox(label="Văn bản", lines=5)
    # type="filepath": the callback returns a path to a file on disk.
    audio_out = gr.Audio(type="filepath", label="Kết quả")
    btn = gr.Button("🎤 Sinh giọng nói")

    # Wire the button: textbox content in, generated audio file path out.
    btn.click(fn=tts_generate, inputs=text_in, outputs=audio_out)

# Start the web server for the demo.
demo.launch()