import gradio as gr
import torch
from transformers import pipeline

# 1. Choose a TTS model from Hugging Face
# You can later change this to another model, e.g. "suno/bark-small" if supported
TTS_MODEL_ID = "facebook/mms-tts-eng"  # English TTS

# 2. Create the TTS pipeline
device = 0 if torch.cuda.is_available() else -1
tts = pipeline("text-to-speech", model=TTS_MODEL_ID, device=device)


def synthesize_tts(text):
    if not text or text.strip() == "":
        raise gr.Error("Please enter some text to synthesize.")

    # 3. Run the pipeline
    out = tts(text)

    # out["audio"] is a numpy array; out["sampling_rate"] is the sample rate.
    # Squeeze away any leading batch dimension so Gradio receives a 1-D waveform.
    audio = (out["sampling_rate"], out["audio"].squeeze())
    return audio


title = "Simple Text-to-Speech (TTS) Space"
description = (
    "Enter some English text and generate speech using a Hugging Face TTS model. "
    "You can later replace the model with F5-TTS for voice cloning."
)

with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                lines=4,
                label="Text to synthesize",
                placeholder="Type some English text here...",
            )
            btn = gr.Button("Generate Speech")
        with gr.Column():
            audio_out = gr.Audio(label="Generated audio", type="numpy")

    btn.click(fn=synthesize_tts, inputs=text_in, outputs=audio_out)

if __name__ == "__main__":
    demo.launch()