"""Gradio demo for Dia 1.6B-0626 text-to-speech with optional voice-cloning prompt."""

import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch

from dia.model import Dia

MODEL_ID = "nari-labs/Dia-1.6B-0626"
SAMPLE_RATE = 44100  # output rate reported to the Gradio audio widget

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Dia.from_pretrained(MODEL_ID, device=device)


def generate_audio(text, audio_prompt):
    """Synthesize speech for *text*, optionally conditioned on an audio prompt.

    Parameters
    ----------
    text : str
        Script to synthesize (expected to use [S1]/[S2] speaker tags per the
        placeholder shown in the UI).
    audio_prompt : tuple[int, np.ndarray] | None
        ``(sample_rate, samples)`` pair from the Gradio ``type="numpy"`` audio
        input, or ``None`` when no prompt was provided.

    Returns
    -------
    tuple[int, np.ndarray]
        ``(SAMPLE_RATE, float32 waveform)`` for the Gradio audio output.

    Raises
    ------
    gr.Error
        If *text* is empty or whitespace-only.
    """
    if not text or text.isspace():
        raise gr.Error("Text input cannot be empty.")

    # Persist the (optional) prompt to a temp WAV, since the model API takes a path.
    prompt_path = None
    if audio_prompt is not None:
        sr, audio = audio_prompt
        # Gradio can hand back an empty recording; only use non-empty prompts.
        if audio is not None and np.size(audio) > 0:
            with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav", delete=False) as f:
                sf.write(f.name, audio, sr)
                prompt_path = f.name

    # BUG FIX: delete the temp prompt file even when generation raises.
    # Previously cleanup ran only after a successful generate(), leaking the
    # file on any model error.
    try:
        audio_out = model.generate(
            text=text,
            audio_prompt=prompt_path,
            cfg_scale=3.0,
            temperature=1.2,
            top_p=0.9,
        )
    finally:
        if prompt_path is not None:
            try:
                # missing_ok subsumes the old exists() pre-check; cleanup
                # stays best-effort, matching the original's swallowed OSError.
                Path(prompt_path).unlink(missing_ok=True)
            except OSError:
                pass

    audio_np = np.asarray(audio_out, dtype=np.float32)
    return SAMPLE_RATE, audio_np


with gr.Blocks() as demo:
    gr.Markdown("# Dia 1.6B-0626 Text-to-Speech")
    with gr.Row():
        with gr.Column(scale=1):
            text_in = gr.Textbox(
                label="Input text",
                lines=6,
                placeholder="Start with [S1] / [S2] tags, e.g.:\n[S1] Hello. [S2] Hi there.",
            )
            audio_prompt_in = gr.Audio(
                label="Audio prompt (optional, voice cloning)",
                sources=["upload", "microphone"],
                type="numpy",
            )
            btn = gr.Button("Generate", variant="primary")
        with gr.Column(scale=1):
            audio_out = gr.Audio(
                label="Generated audio",
                type="numpy",
                autoplay=False,
            )
    btn.click(
        fn=generate_audio,
        inputs=[text_in, audio_prompt_in],
        outputs=[audio_out],
        api_name="generate",
    )

if __name__ == "__main__":
    demo.launch()