import gradio as gr
import torch
from transformers import pipeline

# 1. Choose a TTS model from Hugging Face
# You can later change this to another model, e.g. "suno/bark-small" if supported
TTS_MODEL_ID = "facebook/mms-tts-eng"  # English TTS

# 2. Create the TTS pipeline
device = 0 if torch.cuda.is_available() else -1
tts = pipeline("text-to-speech", model=TTS_MODEL_ID, device=device)


def synthesize_tts(text):
    if not text or text.strip() == "":
        raise gr.Error("Please enter some text to synthesize.")

    # 3. Run the pipeline
    out = tts(text)

    # out["audio"] is a numpy array; out["sampling_rate"] is the sample rate.
    # Squeeze away any leading batch dimension so Gradio receives a 1-D waveform.
    audio = (out["sampling_rate"], out["audio"].squeeze())
    return audio


title = "Simple Text-to-Speech (TTS) Space"
description = (
    "Enter some English text and generate speech using a Hugging Face TTS model. "
    "You can later replace the model with F5-TTS for voice cloning."
)

with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                lines=4,
                label="Text to synthesize",
                placeholder="Type some English text here...",
            )
            btn = gr.Button("Generate Speech")
        with gr.Column():
            audio_out = gr.Audio(label="Generated audio", type="numpy")

    btn.click(fn=synthesize_tts, inputs=text_in, outputs=audio_out)

if __name__ == "__main__":
    demo.launch()