"""
VibeVoice Gradio Demo - High-Quality Dialogue Generation Interface with Streaming Support
"""
import argparse, os, tempfile
import torch, spaces
import gradio as gr
from transformers.utils import logging
from transformers import set_seed
from cached_path import cached_path
from model import VibeVoiceDemo
# Set the transformers logging verbosity to INFO for this process.
logging.set_verbosity_info()
# Module-level logger obtained through the transformers logging wrapper.
logger = logging.get_logger(__name__)
# Initial value of the "Number of Speakers" slider; also controls how many
# speaker dropdowns are visible when the interface is first built.
DEFAULT_NUM_SPEAKERS = 1
# Local location of the VibeVoice-1.5B checkpoint, resolved at import time and
# stored as a plain string.
# NOTE(review): presumably cached_path downloads from the Hugging Face Hub on
# first use and reuses its cache afterwards — confirm against cached_path docs.
model_local_dir= str(cached_path("hf://microsoft/VibeVoice-1.5B"))
# Alternative kept for reference: use a fixed local checkpoint directory and
# populate it with snapshot_download instead of cached_path.
#model_local_dir= "./ckpts/vibevoice"
#snapshot_download(repo_id="microsoft/VibeVoice-1.5B", local_dir=model_local_dir)
def create_demo_interface(demo_instance: VibeVoiceDemo):
"""Create the Gradio interface with streaming support."""
custom_css = ""
with gr.Blocks(
title="VibeVoice - AI Podcast Generator",
css=custom_css,
theme=gr.themes.Soft(
primary_hue="blue",
secondary_hue="purple",
neutral_hue="slate",
)
) as interface:
# Header
gr.HTML("""
šļø Vibe Podcasting
Generating Long-form Multi-speaker AI Podcast with VibeVoice
""")
with gr.Row():
# Left column - Settings
with gr.Column(scale=1, elem_classes="settings-card"):
def process_and_refresh_voices(uploaded_files: list[tempfile._TemporaryFileWrapper]):
    """Register uploaded voice samples and build updates for the speaker dropdowns.

    Each uploaded file is stored in ``demo_instance.available_voices`` under
    the base name of its ``.name`` path, with the full ``.name`` path as the
    value. An existing entry with the same base name is overwritten.

    Args:
        uploaded_files: Objects exposing a ``.name`` path attribute, as
            delivered by the upload component; may be ``None`` or empty.

    Returns:
        A list with one ``gr.update`` per entry in ``speaker_selections``,
        followed by a trailing ``None``. When files were uploaded, every
        update carries the refreshed list of voice names as ``choices``;
        otherwise the updates are empty (no change). The trailing ``None`` is
        the value for the last wired output (the upload component in the
        click handler).
    """
    if uploaded_files:
        for upload in uploaded_files:
            sample_path = upload.name
            demo_instance.available_voices[os.path.basename(sample_path)] = sample_path
        # One shared list of names is handed to every dropdown update.
        refresh_kwargs = {"choices": list(demo_instance.available_voices.keys())}
    else:
        # Nothing uploaded: leave all dropdowns untouched.
        refresh_kwargs = {}
    dropdown_updates = [gr.update(**refresh_kwargs) for _ in speaker_selections]
    return dropdown_updates + [None]
gr.Markdown("### šļø **Podcast Settings**")
# Number of speakers
num_speakers = gr.Slider(
minimum=1,
maximum=4,
step=1,
value=DEFAULT_NUM_SPEAKERS,
label="Number of Speakers",
elem_classes="slider-container"
)
# Speaker selection
gr.Markdown("### š **Speaker Selection**")
available_speaker_names = list(demo_instance.available_voices.keys())
#default_speakers = available_speaker_names[:4] if len(available_speaker_names) >= 4 else available_speaker_names
#default_speakers = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
default_speakers = available_speaker_names
speaker_selections = []
for i in range(4):
default_value = default_speakers[i] if i < len(default_speakers) else None
speaker = gr.Dropdown(
choices=available_speaker_names,
value=default_value,
label=f"Speaker {i+1}",
visible=(i < DEFAULT_NUM_SPEAKERS), # Initially show only the first DEFAULT_NUM_SPEAKERS speaker dropdown(s)
elem_classes="speaker-item"
)
speaker_selections.append(speaker)
with gr.Accordion("š¤ Upload Custom Voices", open=True):
upload_audio = gr.File(label="Upload Voice Samples", file_count="multiple", file_types=["audio"])
process_upload_btn = gr.Button("Add Uploaded Voices to Speaker Selection")
process_upload_btn.click(fn=process_and_refresh_voices, inputs=upload_audio, outputs=speaker_selections + [upload_audio])
# Advanced settings
gr.Markdown("### āļø **Advanced Settings**")
# Sampling parameters (contains all generation settings)
with gr.Accordion("Generation Parameters", open=False):
cfg_scale = gr.Slider(
minimum=1.0,
maximum=2.0,
value=1.3,
step=0.05,
label="CFG Scale (Guidance Strength)",
# info="Higher values increase adherence to text",
elem_classes="slider-container"
)
disable_voice_cloning = gr.Checkbox(
value=False,
label="Disable voice cloning (skip conditioning voice prompts)",
info="When enabled, sets is_prefill=False so the model ignores provided speaker audio."
)
# Right column - Generation
with gr.Column(scale=2, elem_classes="generation-card"):
gr.Markdown("### š **Script Input**")
script_input = gr.Textbox(
label="Conversation Script",
placeholder="""Enter your podcast script here. You can format it as:
Speaker 1: Welcome to our podcast today!
Speaker 2: Thanks for having me. I'm excited to discuss...
Or paste text directly and it will auto-assign speakers.""",
lines=12,
max_lines=20,
elem_classes="script-input"
)
# Button row with Random Example on the left and Generate on the right
with gr.Row():
# Random example button (now on the left)
random_example_btn = gr.Button(
"š² Random Example",
size="lg",
variant="secondary",
elem_classes="random-btn",
scale=1 # Smaller width
)
# Generate button (now on the right)
generate_btn = gr.Button(
"š Generate Podcast",
size="lg",
variant="primary",
elem_classes="generate-btn",
scale=2 # Wider than random button
)
# Stop button
stop_btn = gr.Button(
"š Stop Generation",
size="lg",
variant="stop",
elem_classes="stop-btn",
visible=False
)
# Streaming status indicator
streaming_status = gr.HTML(
value="""