Spaces:

jordand
/

echo-tts-preview

Running on Zero

File size: 107,869 Bytes

# The code in this file is almost entirely written by LLMs and is much, much, much messier than it needs to be (at this point it's not clear to what extent it is even human-modifiable). We'd hope to improve this for any future local gradio release(s).

import tempfile
import os
import json
import time
import secrets
import logging
from pathlib import Path
from typing import Tuple, Any
from functools import partial

os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
logging.getLogger("huggingface_hub").setLevel(logging.ERROR)

import warnings

# Suppress torchaudio TorchCodec parameter warnings
warnings.filterwarnings('ignore', message='.*encoding.*parameter is not fully supported by TorchCodec')
warnings.filterwarnings('ignore', message='.*bits_per_sample.*parameter is not directly supported by TorchCodec')
warnings.filterwarnings('ignore', message='.* is not used by TorchCodec AudioEncoder. Format is determined by the file extension.')

import gradio as gr
import numpy as np
import torch
import torchaudio
from huggingface_hub import snapshot_download
import spaces

from inference import (
    load_model_from_hf,
    load_fish_ae_from_hf,
    load_pca_state_from_hf,
    load_audio,
    ae_reconstruct,
    sample_pipeline
)
from samplers import sample_euler_cfg_any, GuidanceMode

import tarfile

# --------------------------------------------------------------------
### Configuration
# dtype handed to load_model_from_hf() for the main generation model.
MODEL_DTYPE = torch.bfloat16

# dtype handed to load_fish_ae_from_hf() for the Fish autoencoder.
FISH_AE_DTYPE = torch.float32
# Alternative: FISH_AE_DTYPE = torch.bfloat16  (author's note: possibly slightly
# worse quality; prefer float32 when memory allows)

# When True, WAV files are written as 16-bit signed PCM; otherwise torchaudio's
# default encoding is used (32-bit float according to the original author).
USE_16_BIT_WAV = True

# Audio prompt library for the Custom Audio panel (included in the repo).
AUDIO_PROMPT_FOLDER = Path("./prompt_audio")


# When not running on Zero GPU, ask load_fish_ae_from_hf() to compile the
# autoencoder at load time (see load_models()).
COMPILE_FISH_IF_NOT_ON_ZERO_GPU = True

# Silentcipher watermarking configuration.
USE_SILENTCIPHER = True  # Enable/disable audio watermarking of generated audio
SILENTCIPHER_MESSAGE = [91, 57, 81, 60, 83]  # Watermark message (list of integers)
SILENTCIPHER_SDR = 47  # Message SDR in dB (higher = less perceptible but less robust)

# Hugging Face token read from the environment (None when unset); forwarded to
# every Hub download so private repos can be accessed.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# --------------------------------------------------------------------

# True when the SPACES_ZERO_GPU environment variable is set. The app treats
# torch.compile as unavailable in that case (see compile_model / do_compile).
IS_ZEROGPU = os.environ.get("SPACES_ZERO_GPU") is not None


# print("FISH_AE_DTYPE:", FISH_AE_DTYPE)
# print("IS_ZEROGPU:", IS_ZEROGPU)
# if IS_ZEROGPU:
#     print("Running on Zero GPU - model compilation disabled")
# else:
#     print("Not on Zero GPU - model compilation available")

def _safe_members(tf, prefix):
    """Yield the archive members that live under ``prefix`` and are safe to extract.

    Args:
        tf: Object exposing ``getmembers()`` whose items have a ``name``
            attribute (an opened ``tarfile.TarFile`` in this file).
        prefix: Top-level folder inside the archive; a trailing ``/`` is
            appended when missing, so the bare folder entry itself is not
            yielded.

    Yields:
        Members whose name starts with the normalised prefix, is not an
        absolute path, and has no ``..`` component.
    """
    wanted = prefix if prefix.endswith('/') else prefix + '/'
    for member in tf.getmembers():
        if not member.name.startswith(wanted):
            continue
        member_path = Path(member.name)
        # Reject anything that could escape the extraction directory.
        if member_path.is_absolute() or '..' in member_path.parts:
            continue
        yield member

def ensure_tar_tree(repo_id: str, root: str, *, token: str | None = None, max_workers: int = 4):
    """Download a dataset snapshot and make sure ``<root>.tar`` is unpacked.

    Args:
        repo_id: Hugging Face dataset repository to download from.
        root: Name of the archive (without ``.tar``) and of the folder it
            is expected to unpack to.
        token: Optional Hugging Face access token.
        max_workers: Forwarded to ``snapshot_download``.

    Returns:
        Path to ``<snapshot>/<root>``. When that folder already exists it is
        returned without touching the archive.

    Raises:
        FileNotFoundError: The folder is missing and so is ``<root>.tar``.
    """
    os.environ.setdefault('HF_HUB_ENABLE_HF_TRANSFER', '1')
    from huggingface_hub import snapshot_download

    wanted_files = [f'{root}.tar', 'index.jsonl', 'README.md', 'LICENSE']
    snapshot_dir = Path(
        snapshot_download(
            repo_id=repo_id,
            repo_type='dataset',
            allow_patterns=wanted_files,
            resume_download=True,
            token=token,
            max_workers=max_workers,
        )
    )

    extracted_dir = snapshot_dir / root
    if not extracted_dir.exists():
        archive = snapshot_dir / f'{root}.tar'
        if not archive.exists():
            raise FileNotFoundError(f'Expected {archive} in snapshot')
        # Only members under ``root`` that pass the path checks are unpacked.
        with tarfile.open(archive, 'r') as tar:
            tar.extractall(snapshot_dir, members=_safe_members(tar, root))
    return extracted_dir


# Speaker-embedding libraries: each call downloads the dataset snapshot (if
# needed) and returns the folder the archive unpacks to.
EARS_PATH = ensure_tar_tree(repo_id="jordand/echo-embeddings-ears-tar", root="EARS", token=HF_TOKEN)
VCTK_PATH = ensure_tar_tree(repo_id="jordand/echo-embeddings-vctk-tar", root="VCTK", token=HF_TOKEN)
EXPRESSO_PATH = ensure_tar_tree(repo_id="jordand/echo-embeddings-expresso-tar", root="Expresso", token=HF_TOKEN)


from huggingface_hub import snapshot_download

# Custom voices ship as loose files rather than a tar archive; only the latent,
# metadata and preview audio of each voice are fetched. The path is joined with
# pathlib instead of string concatenation.
HF_CUSTOM_PATH = Path(snapshot_download(
    repo_id="jordand/echo-embeddings-custom",
    repo_type="dataset",
    allow_patterns=[
        "HF-Custom/**/speaker_latent.safetensors",
        "HF-Custom/**/metadata.json",
        "HF-Custom/**/audio.mp3",
    ],
    token=HF_TOKEN,
)) / "HF-Custom"

# Scratch directory for generated/reference audio; created at import time and
# pruned by cleanup_temp_audio().
TEMP_AUDIO_DIR = Path('./temp_gradio_audio')
TEMP_AUDIO_DIR.mkdir(parents=True, exist_ok=True)

# Helper functions for unique filenames and cleanup
def make_stem(prefix: str, user_id: str | None = None) -> str:
    """Create unique filename stem: prefix__user__timestamp_random or prefix__timestamp_random if no user_id."""
    ts = int(time.time() * 1000)
    rand = secrets.token_hex(4)
    if user_id:
        return f"{prefix}__{user_id}__{ts}_{rand}"
    return f"{prefix}__{ts}_{rand}"

def cleanup_temp_audio(dir_: Path, user_id: str | None, max_age_sec: int = 60 * 5):
    """Remove old files globally and all previous files for this user."""
    now = time.time()
    
    # 1) Global TTL: remove any file older than max_age_sec
    for p in dir_.glob("*"):
        try:
            if p.is_file() and (now - p.stat().st_mtime) > max_age_sec:
                p.unlink(missing_ok=True)
        except Exception:
            pass
    
    # 2) Per-user: remove ALL previous files for this user (we don't need to keep any)
    if user_id:
        for p in dir_.glob(f"*__{user_id}__*"):
            try:
                if p.is_file():
                    p.unlink(missing_ok=True)
            except Exception:
                pass

# Text presets file, one preset per line (parsed by load_text_presets()).
TEXT_PRESETS_PATH = Path('./text_presets.txt')

# Sampler presets file (JSON); its consumer is outside this part of the file.
SAMPLER_PRESETS_PATH = Path('./sampler_presets.json')

# Global model handles, populated lazily by load_models() on first use
# (lazy loading is required for Zero GPU, per the original author).
model = None  # Uncompiled model; also the "already loaded" sentinel in load_models()
model_compiled = None  # Separate torch.compile'd model so the UI can toggle between the two
fish_ae = None  # Autoencoder returned by load_fish_ae_from_hf()
pca_state = None  # State returned by load_pca_state_from_hf()
silentcipher_model = None  # Silentcipher watermarking model (stays None if loading fails)
_model_compiled = False  # Set to True by do_compile() once compilation has succeeded

def load_models():
    """Lazily load all models on first use (required for Zero GPU).

    Populates the module globals ``model``, ``fish_ae``, ``pca_state`` and,
    when ``USE_SILENTCIPHER`` is set and loading succeeds,
    ``silentcipher_model``. Subsequent calls are no-ops once ``model`` is set.

    Everything is loaded into locals first and ``model`` is published last.
    ``model`` doubles as the "already loaded" flag, so assigning it before the
    other loaders had succeeded would leave ``fish_ae`` / ``pca_state`` as
    ``None`` forever after a transient failure; now a failed load is simply
    retried on the next call.

    Raises:
        Whatever the ``load_*_from_hf`` helpers raise. A silentcipher failure
        is only logged and generation continues without watermarking.
    """
    global model, fish_ae, pca_state, silentcipher_model
    if model is not None:
        return

    loaded_model = load_model_from_hf(dtype=MODEL_DTYPE, compile=False, token=HF_TOKEN)
    loaded_fish_ae = load_fish_ae_from_hf(
        compile=(COMPILE_FISH_IF_NOT_ON_ZERO_GPU and not IS_ZEROGPU),
        dtype=FISH_AE_DTYPE,
        token=HF_TOKEN,
    )
    loaded_pca_state = load_pca_state_from_hf(token=HF_TOKEN)

    # Watermarking is optional: failure to load it must not block generation.
    if USE_SILENTCIPHER:
        try:
            import silentcipher
            silentcipher_model = silentcipher.get_model(model_type='44.1k', device='cuda')
        except Exception as e:
            print(f"Warning: Failed to load silentcipher model: {e}")
            print("Continuing without watermarking...")

    # Publish only after every mandatory component loaded; ``model`` goes last
    # because it is the sentinel checked at the top of this function.
    fish_ae = loaded_fish_ae
    pca_state = loaded_pca_state
    model = loaded_model


def compile_model(should_compile):
    """React to the "compile" checkbox and report what will happen next.

    This only produces UI updates; the compilation itself is done by
    ``do_compile``.

    Args:
        should_compile: Current value of the checkbox.

    Returns:
        A pair ``(checkbox_update, status_update)`` of ``gr.update`` objects.
    """
    def _ui(checked, interactive, status):
        # The status line is shown exactly when there is something to say.
        return (
            gr.update(value=checked, interactive=interactive),
            gr.update(value=status, visible=status != ""),
        )

    if IS_ZEROGPU:
        # Compilation is not supported on Zero GPU: lock the checkbox off.
        return _ui(False, False, "⚠️ Compile disabled on Zero GPU")
    if not should_compile:
        # Unchecked by the user: clear the status and keep the box usable.
        return _ui(False, True, "")
    if _model_compiled:
        return _ui(True, True, "✓ Model already compiled")
    # Compilation is about to start: lock the checkbox while it runs.
    return _ui(True, False, "⏳ Compiling... (1-3 minutes)")


def do_compile():
    """Create the compiled copy of the model used when "compile" is enabled.

    Loads the models if necessary, wraps ``model`` with ``torch.compile`` and
    also compiles its two KV-cache methods, storing the result in the global
    ``model_compiled`` and setting ``_model_compiled``.

    Returns:
        A pair ``(status_update, checkbox_update)`` of ``gr.update`` objects.
        Any exception during compilation is printed and reported in the
        status text rather than raised.
    """
    global model_compiled, _model_compiled

    if IS_ZEROGPU:
        # Nothing to do on Zero GPU; keep the checkbox disabled.
        return gr.update(value="⚠️ Compile disabled on Zero GPU", visible=True), gr.update(interactive=False)

    def _done():
        # Hide the status line and hand control of the checkbox back.
        return gr.update(value="", visible=False), gr.update(interactive=True)

    if _model_compiled:
        return _done()

    try:
        # Models are needed before they can be compiled; since Zero GPU never
        # reaches this point, loading eagerly here is safe.
        load_models()

        model_compiled = torch.compile(model)
        for method_name in ("get_kv_cache", "get_kv_cache_from_precomputed_speaker_state"):
            setattr(model_compiled, method_name, torch.compile(getattr(model, method_name)))

        _model_compiled = True
        return _done()
    except Exception as e:
        reason = str(e)
        print(f"Compilation failed: {reason}")
        return gr.update(value=f"✗ Compilation failed: {reason}", visible=True), gr.update(interactive=True)


def save_audio_with_format(audio_tensor: torch.Tensor, base_path: Path, filename: str, sample_rate: int, audio_format: str) -> Path:
    """Write ``audio_tensor`` to ``base_path / filename`` as MP3 or WAV.

    Args:
        audio_tensor: Audio passed straight to ``torchaudio.save``.
        base_path: Destination directory.
        filename: File stem without extension.
        sample_rate: Sample rate passed to ``torchaudio.save``.
        audio_format: ``"mp3"`` selects MP3; any other value selects WAV.

    Returns:
        Path of the file written. If MP3 encoding raises, the error is
        printed and a ``.wav`` file is written instead.
    """
    def _write_wav() -> Path:
        wav_path = base_path / f"{filename}.wav"
        if USE_16_BIT_WAV:
            torchaudio.save(str(wav_path), audio_tensor, sample_rate, encoding="PCM_S", bits_per_sample=16)
        else:
            torchaudio.save(str(wav_path), audio_tensor, sample_rate)
        return wav_path

    if audio_format != "mp3":
        return _write_wav()

    try:
        mp3_path = base_path / f"{filename}.mp3"
        torchaudio.save(
            str(mp3_path),
            audio_tensor,
            sample_rate,
            format="mp3",
            encoding="mp3",
            bits_per_sample=None
        )
        return mp3_path
    except Exception as e:
        # MP3 support depends on the audio backend; degrade to WAV.
        print(f"MP3 encoding failed: {e}, falling back to WAV")
        return _write_wav()


def _parse_optional_float(value):
    """Convert ``value`` to ``float``; ``None`` or a blank string yields ``None``."""
    if value is None or (isinstance(value, str) and not value.strip()):
        return None
    return float(value)


def _parse_optional_int(value, default=None):
    """Convert ``value`` to ``int``; ``None`` or a blank string yields ``default``."""
    if value is None or (isinstance(value, str) and not value.strip()):
        return default
    return int(value)


def _resolve_guidance_args(cfg_mode, cfg_scale_speaker, apg_values):
    """Translate the UI guidance-mode string into sampler arguments.

    Args:
        cfg_mode: ``"independent"``, ``"alternating"`` or ``"apg-independent"``;
            any other value is treated as joint-unconditional.
        cfg_scale_speaker: Raw speaker CFG scale from the UI.
        apg_values: Mapping of sampler keyword name to raw APG value.

    Returns:
        ``(guidance_mode, cfg_scale_speaker_val, apg_kwargs)``. The speaker
        scale is ``None`` in joint mode; every APG value is ``None`` unless the
        mode is ``"apg-independent"``.
    """
    known_modes = {
        "independent": GuidanceMode.INDEPENDENT,
        "alternating": GuidanceMode.ALTERNATING,
        "apg-independent": GuidanceMode.APG,
    }
    guidance_mode = known_modes.get(cfg_mode, GuidanceMode.JOINT)
    # For joint (unconditional) guidance the speaker scale must be None.
    speaker_scale = _parse_optional_float(cfg_scale_speaker) if cfg_mode in known_modes else None
    use_apg = cfg_mode == "apg-independent"
    apg_kwargs = {
        name: (_parse_optional_float(raw) if use_apg else None)
        for name, raw in apg_values.items()
    }
    return guidance_mode, speaker_scale, apg_kwargs


@spaces.GPU
def generate_audio(
    text_prompt: str,
    speaker_st_path: str,
    speaker_audio_path: str,
    # Sampling parameters
    num_steps: int,
    rng_seed: int,
    cfg_mode: str,
    cfg_scale_text: float,
    cfg_scale_speaker: float,
    cfg_min_t: float,
    cfg_max_t: float,
    truncation_factor: float,
    rescale_k: float,
    rescale_sigma: float,
    speaker_k_enable: bool,
    speaker_k_scale: float,
    speaker_k_min_t: float,
    speaker_k_max_layers: int,
    apg_eta_text: float,
    apg_eta_speaker: float,
    apg_momentum_text: float,
    apg_momentum_speaker: float,
    apg_norm_text: str,
    apg_norm_speaker: str,
    reconstruct_first_30_seconds: bool,
    use_custom_shapes: bool,
    max_text_byte_length: str,
    max_speaker_latent_length: str,
    sample_latent_len: str,
    audio_format: str,
    use_compile: bool,
    show_original_audio: bool,
    session_id: str,
) -> Tuple[Any, ...]:
    """Generate speech for ``text_prompt`` and return the Gradio UI updates.

    Steps: load models lazily, prune old temp files for this session, parse
    the UI values into sampler arguments, run ``sample_pipeline``, optionally
    watermark the result, save it, and optionally save a reconstruction and
    a copy of the speaker reference audio.

    Args:
        text_prompt: Text to synthesise.
        speaker_st_path: Not used by this function.
        speaker_audio_path: Reference speaker audio; empty/``None`` means no
            speaker conditioning.
        num_steps: Sampler steps, clamped to ``[1, 80]``.
        rng_seed: Seed; ``None`` is treated as 0.
        cfg_mode: Guidance mode string (see ``_resolve_guidance_args``).
        cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t,
        truncation_factor, rescale_sigma: Forwarded to the sampler as floats.
        rescale_k: Forwarded as a float; exactly ``1.0`` means "off" (``None``).
        speaker_k_enable: When false the three ``speaker_k_*`` values are
            passed as ``None``.
        apg_*: Only used in ``"apg-independent"`` mode; blank means ``None``.
        reconstruct_first_30_seconds: Also save an autoencoder reconstruction
            of the start of the speaker audio.
        use_custom_shapes: Use the three shape fields below instead of the
            defaults (768 / 2560 / 640); blank fields mean ``None`` (or 640
            for ``sample_latent_len``).
        audio_format: ``"mp3"`` or anything else for WAV.
        use_compile: Prefer the compiled model when one exists.
        show_original_audio: Also save the loaded speaker audio.
        session_id: Identifier embedded in temp filenames.

    Returns:
        A 9-tuple of ``gr.update`` objects: a no-op update, generated audio,
        text-prompt markdown, original audio, timing text, reconstruction
        audio, and three visibility updates (original accordion, reference
        accordion, reference header).
    """
    sample_rate = 44100

    # Load models on first use (required for Zero GPU).
    load_models()

    # Use the compiled model only when it was requested AND already exists.
    active_model = model_compiled if (use_compile and model_compiled is not None) else model
    if use_compile and model_compiled is None:
        print("Warning: Compile requested but model not yet compiled. Using uncompiled model.")

    # Drop expired temp files globally and all previous files of this session.
    cleanup_temp_audio(TEMP_AUDIO_DIR, session_id)

    # The speaker reference is optional; normalise "" to None.
    if not speaker_audio_path:
        speaker_audio_path = None
    has_speaker = speaker_audio_path is not None

    start_time = time.time()

    # ---- Parse sampler parameters ------------------------------------
    num_steps_int = min(max(int(num_steps), 1), 80)  # Clamp to [1, 80]
    rng_seed_int = int(rng_seed) if rng_seed is not None else 0
    rescale_k_val = float(rescale_k) if rescale_k != 1.0 else None  # 1.0 means "off"

    guidance_mode, cfg_scale_speaker_val, apg_kwargs = _resolve_guidance_args(
        cfg_mode,
        cfg_scale_speaker,
        {
            "apg_eta_text": apg_eta_text,
            "apg_eta_speaker": apg_eta_speaker,
            "apg_momentum_text": apg_momentum_text,
            "apg_momentum_speaker": apg_momentum_speaker,
            "apg_norm_text": apg_norm_text,
            "apg_norm_speaker": apg_norm_speaker,
        },
    )

    # Speaker K scaling is available in every guidance mode.
    if speaker_k_enable:
        speaker_k_scale_val = _parse_optional_float(speaker_k_scale)
        speaker_k_min_t_val = _parse_optional_float(speaker_k_min_t)
        speaker_k_max_layers_val = _parse_optional_int(speaker_k_max_layers)
    else:
        speaker_k_scale_val = None
        speaker_k_min_t_val = None
        speaker_k_max_layers_val = None

    if use_custom_shapes:
        # Blank values are allowed for the first two fields (-> None).
        pad_to_max_text_seq_len = _parse_optional_int(max_text_byte_length)
        pad_to_max_speaker_latent_len = _parse_optional_int(max_speaker_latent_length)
        sample_latent_len_val = _parse_optional_int(sample_latent_len, default=640)
    else:
        pad_to_max_text_seq_len = 768
        pad_to_max_speaker_latent_len = 2560
        sample_latent_len_val = 640

    sample_fn = partial(
        sample_euler_cfg_any,
        num_steps=num_steps_int,
        guidance_mode=guidance_mode,
        cfg_scale_text=float(cfg_scale_text),
        cfg_scale_speaker=cfg_scale_speaker_val,
        cfg_min_t=float(cfg_min_t),
        cfg_max_t=float(cfg_max_t),
        truncation_factor=float(truncation_factor),
        rescale_k=rescale_k_val,
        rescale_sigma=float(rescale_sigma),
        speaker_k_scale=speaker_k_scale_val,
        speaker_k_min_t=speaker_k_min_t_val,
        speaker_k_max_layers=speaker_k_max_layers_val,
        block_size=sample_latent_len_val,
        **apg_kwargs,
    )

    # ---- Generate ----------------------------------------------------
    speaker_audio = load_audio(speaker_audio_path).cuda() if has_speaker else None

    audio_out = sample_pipeline(
        model=active_model,
        fish_ae=fish_ae,
        pca_state=pca_state,
        sample_fn=sample_fn,
        text_prompt=text_prompt,
        speaker_audio=speaker_audio,
        rng_seed=rng_seed_int,
        pad_to_max_text_seq_len=pad_to_max_text_seq_len,
        pad_to_max_speaker_latent_len=pad_to_max_speaker_latent_len,
    )

    # ---- Watermark (best effort) and save ----------------------------
    audio_to_save = audio_out[0].cpu()
    if USE_SILENTCIPHER and silentcipher_model is not None:
        try:
            audio_numpy = audio_to_save.squeeze(0).numpy()
            encoded_audio, sdr = silentcipher_model.encode_wav(
                audio_numpy,
                sample_rate,
                SILENTCIPHER_MESSAGE,
                message_sdr=SILENTCIPHER_SDR
            )
            audio_to_save = torch.tensor(encoded_audio).unsqueeze(0)
        except Exception as e:
            print(f"Warning: Watermarking failed: {e}")
            print("Saving audio without watermark...")

    output_path = save_audio_with_format(
        audio_to_save,
        TEMP_AUDIO_DIR,
        make_stem("generated", session_id),
        sample_rate,
        audio_format
    )

    generation_time = time.time() - start_time
    time_str = f"⏱️ Total generation time: {generation_time:.2f}s"
    text_display = f"**Text Prompt:**\n\n{text_prompt}"

    # ---- Optional reference outputs ----------------------------------
    recon_output_path = None
    original_output_path = None

    if reconstruct_first_30_seconds and has_speaker:
        # 2048 * 640 samples is roughly 30 s at the 44.1 kHz rate used when
        # saving. Shorter clips are zero-padded to that length for the
        # autoencoder and the output is trimmed back to the clip length.
        max_samples = 2048 * 640
        padded = torch.nn.functional.pad(
            speaker_audio[..., :max_samples],
            (0, max(0, max_samples - speaker_audio.shape[-1]))
        )[None]
        audio_recon = ae_reconstruct(
            fish_ae=fish_ae,
            pca_state=pca_state,
            audio=padded,
        )[..., :speaker_audio.shape[-1]]

        recon_output_path = save_audio_with_format(
            audio_recon.cpu()[0],
            TEMP_AUDIO_DIR,
            make_stem("speaker_recon", session_id),
            sample_rate,
            audio_format
        )

    if show_original_audio and has_speaker:
        # Saves the audio exactly as returned by load_audio().
        original_output_path = save_audio_with_format(
            speaker_audio.cpu(),
            TEMP_AUDIO_DIR,
            make_stem("original_audio", session_id),
            sample_rate,
            audio_format
        )

    # ---- UI updates --------------------------------------------------
    show_reference_section = (show_original_audio or reconstruct_first_30_seconds) and has_speaker
    return (
        gr.update(),
        gr.update(value=str(output_path), visible=True),
        gr.update(value=text_display, visible=True),
        gr.update(value=str(original_output_path) if original_output_path else None, visible=True),
        gr.update(value=time_str, visible=True),
        gr.update(value=str(recon_output_path) if recon_output_path else None, visible=True),
        gr.update(visible=(show_original_audio and has_speaker)),  # original_accordion visibility
        gr.update(visible=(reconstruct_first_30_seconds and has_speaker)),  # reference_accordion visibility
        gr.update(visible=show_reference_section)  # reference_audio_header visibility
    )


# UI Helper Functions

def load_speaker_metadata(speaker_id):
    """Return the ``speaker_metadata`` dict of an EARS speaker.

    Scans the EARS folder for sub-folders named ``<speaker_id>_*`` and returns
    the ``"speaker_metadata"`` entry (``{}`` when the key is absent) of the
    first ``metadata.json`` that can be read. Unreadable files are skipped.

    Returns:
        The metadata dict, or ``None`` when the EARS folder is missing or no
        readable metadata file was found.
    """
    if not EARS_PATH.exists():
        return None

    folder_prefix = f"{speaker_id}_"
    for entry in EARS_PATH.iterdir():
        if not (entry.is_dir() and entry.name.startswith(folder_prefix)):
            continue
        meta_file = entry / "metadata.json"
        if not meta_file.exists():
            continue
        try:
            with open(meta_file, 'r') as fh:
                payload = json.load(fh)
            return payload.get("speaker_metadata", {})
        except Exception:
            # Broken metadata in one folder: try the speaker's next folder.
            continue
    return None


def get_speakers():
    """List the EARS speakers, sorted by id, with their metadata.

    Speaker ids are the part before the first ``_`` of every sub-folder whose
    name starts with ``p`` and contains ``_``.

    Returns:
        A list of dicts with keys ``id``, ``gender``, ``age``, ``ethnicity``
        and ``native_language``; missing values are ``'unknown'``. Empty when
        the EARS folder does not exist.
    """
    if not EARS_PATH.exists():
        return []

    speaker_ids = {
        entry.name.split('_')[0]
        for entry in EARS_PATH.iterdir()
        if entry.is_dir() and entry.name.startswith('p') and '_' in entry.name
    }

    # (output key, key inside the metadata file)
    field_map = (
        ('gender', 'gender'),
        ('age', 'age'),
        ('ethnicity', 'ethnicity'),
        ('native_language', 'native language'),
    )

    speakers = []
    for speaker_id in sorted(speaker_ids):
        # No (or empty) metadata behaves like a dict with every key missing.
        metadata = load_speaker_metadata(speaker_id) or {}
        record = {'id': speaker_id}
        for out_key, meta_key in field_map:
            record[out_key] = metadata.get(meta_key, 'unknown')
        speakers.append(record)
    return speakers


def get_speakers_table(search_query=""):
    """Build the EARS speaker table rows for Gradio.

    Each row is ``[id, gender, age, ethnicity, native_language]`` with gender
    abbreviated to ``M`` / ``F`` (otherwise the upper-cased first letter, or
    ``?`` when empty). When ``search_query`` is non-empty, only rows whose
    space-joined, lower-cased cells contain the lower-cased query are kept.
    """
    table = []
    for spk in get_speakers():
        raw_gender = spk['gender']
        normalized = raw_gender.lower()
        if normalized == 'male':
            gender_code = 'M'
        elif normalized == 'female':
            gender_code = 'F'
        elif raw_gender:
            gender_code = raw_gender[0].upper()
        else:
            gender_code = '?'

        cells = [spk['id'], gender_code, spk['age'], spk['ethnicity'], spk['native_language']]

        if search_query:
            haystack = " ".join(f"{cell}" for cell in cells).lower()
            if search_query.lower() not in haystack:
                continue

        table.append(cells)
    return table


def get_audio_length_from_metadata(voice_dir):
    """Return the clip length recorded in ``voice_dir / "metadata.json"``.

    Returns:
        ``"<seconds with one decimal>s"`` taken from the
        ``"audio_length_seconds"`` key (0 when the key is absent), or
        ``"N/A"`` when the file is missing or cannot be read or formatted.
    """
    meta_file = voice_dir / "metadata.json"
    if not meta_file.exists():
        return "N/A"
    try:
        with open(meta_file, 'r') as fh:
            seconds = json.load(fh).get("audio_length_seconds", 0)
        return f"{seconds:.1f}s"
    except Exception:
        # Any parse or format problem is reported as "not available".
        return "N/A"


def get_freeform_table(speaker_id):
    """Return the one-row "Freeform" table for an EARS speaker.

    Returns:
        ``[["Freeform", <audio length>]]`` when ``<speaker_id>_freeform``
        exists and contains both ``audio.mp3`` and
        ``speaker_latent.safetensors``; otherwise ``[]``.
    """
    if not EARS_PATH.exists() or not speaker_id:
        return []

    folder = EARS_PATH / f"{speaker_id}_freeform"
    required_files = (folder / "audio.mp3", folder / "speaker_latent.safetensors")
    if not folder.exists() or not all(path.exists() for path in required_files):
        return []
    return [["Freeform", get_audio_length_from_metadata(folder)]]


def get_emotions_for_speaker(speaker_id):
    """List ``(emotion, audio_length)`` pairs available for an EARS speaker.

    Considers folders named ``<speaker_id>_emo_<emotion>`` (in sorted order),
    skipping names containing ``_joint_`` and folders that lack ``audio.mp3``
    or ``speaker_latent.safetensors``.
    """
    if not EARS_PATH.exists() or not speaker_id:
        return []

    wanted_prefix = f"{speaker_id}_emo_"
    found = []
    for entry in sorted(EARS_PATH.iterdir()):
        if not entry.is_dir():
            continue
        folder_name = entry.name
        if not folder_name.startswith(wanted_prefix) or "_joint_" in folder_name:
            continue
        pieces = folder_name.split('_emo_')
        if len(pieces) != 2:
            # The separator occurs more than once: not a plain emotion folder.
            continue
        has_audio = (entry / "audio.mp3").exists()
        has_latent = (entry / "speaker_latent.safetensors").exists()
        if has_audio and has_latent:
            found.append((pieces[1], get_audio_length_from_metadata(entry)))
    return found


def get_emotions_table(speaker_id):
    """Return ``[[emotion, audio_length], ...]`` rows for a speaker (``[]`` if none given)."""
    if not speaker_id:
        return []
    return [list(pair) for pair in get_emotions_for_speaker(speaker_id)]


# VCTK Helper Functions

def get_vctk_speakers():
    """List VCTK speakers (sorted by folder name) with their metadata.

    A folder counts when its name starts with ``p`` and it contains
    ``audio.mp3``, ``speaker_latent.safetensors`` and ``metadata.json``.
    Folders whose metadata cannot be read or formatted are skipped.

    Returns:
        A list of dicts with keys ``id``, ``gender``, ``age``, ``details``
        and ``audio_length`` (formatted like ``"12.3s"``).
    """
    if not VCTK_PATH.exists():
        return []

    required_names = ("audio.mp3", "speaker_latent.safetensors", "metadata.json")
    speakers = []
    for entry in sorted(VCTK_PATH.iterdir()):
        if not (entry.is_dir() and entry.name.startswith('p')):
            continue
        if not all((entry / name).exists() for name in required_names):
            continue
        try:
            with open(entry / "metadata.json", 'r') as fh:
                payload = json.load(fh)
            info = payload.get("speaker_info", {})
            seconds = payload.get("total_audio_length_seconds", 0)
            speakers.append({
                # Fall back to the folder name when the file carries no id.
                'id': info.get('id', entry.name),
                'gender': info.get('gender', 'unknown'),
                'age': info.get('age', 'unknown'),
                'details': info.get('details', 'unknown'),
                'audio_length': f"{seconds:.1f}s"
            })
        except Exception:
            continue
    return speakers


def get_vctk_speakers_table(search_query=""):
    """Build the VCTK speaker table rows for Gradio.

    Each row is ``[id, gender, age, details, audio_length]`` with gender
    abbreviated to ``M`` / ``F`` (otherwise the upper-cased first letter, or
    ``?`` when empty). When ``search_query`` is non-empty, only rows whose
    space-joined, lower-cased cells contain the lower-cased query are kept.
    """
    # Single letters map to the same result the first-letter fallback gives.
    abbreviations = {'male': 'M', 'm': 'M', 'female': 'F', 'f': 'F'}

    table = []
    for spk in get_vctk_speakers():
        raw_gender = spk['gender']
        gender_code = abbreviations.get(raw_gender.lower())
        if gender_code is None:
            gender_code = raw_gender[0].upper() if raw_gender else '?'

        cells = [spk['id'], gender_code, spk['age'], spk['details'], spk['audio_length']]

        if search_query:
            haystack = " ".join(f"{cell}" for cell in cells).lower()
            if search_query.lower() not in haystack:
                continue

        table.append(cells)
    return table


def load_text_presets():
    """Read the text presets file into ``[category, word_count, text]`` rows.

    Every non-blank line is ``category | text``; a line without the `` | ``
    separator is filed under ``"Uncategorized"``. ``word_count`` is the number
    of whitespace-separated words in ``text``, as a string. Returns ``[]``
    when the presets file does not exist.
    """
    if not TEXT_PRESETS_PATH.exists():
        return []

    with open(TEXT_PRESETS_PATH, 'r', encoding='utf-8') as fh:
        entries = [raw.strip() for raw in fh if raw.strip()]

    rows = []
    for entry in entries:
        # Split on the first " | " only; the text itself may contain more.
        category, separator, text = entry.partition(" | ")
        if not separator:
            category, text = "Uncategorized", entry
        rows.append([category, str(len(text.split())), text])
    return rows


def search_speakers(search_query):
    """Return a ``gr.update`` that replaces the EARS speaker table with the rows matching ``search_query``."""
    return gr.update(value=get_speakers_table(search_query))


def select_speaker_from_table(evt: gr.SelectData, table_data):
    """Handle a click on the speaker table.

    Args:
        evt: Gradio selection event; ``evt.index`` is either ``(row, col)`` or
            a bare row index.
        table_data: The table as currently displayed (a pandas DataFrame,
            possibly filtered), so the row index refers to what the user sees.

    Returns:
        Eight ``gr.update`` objects: speaker selection text, freeform table,
        emotions table, stored speaker id, audio preview (cleared),
        safetensors path (cleared), audio path (cleared) and voice selection
        text (cleared and hidden). When nothing valid was selected every
        output is reset instead.
    """
    def _reset():
        return (
            gr.update(value="", visible=False),
            gr.update(value=[], visible=True),
            gr.update(value=[], visible=True),
            gr.update(value=""),
            gr.update(value=None),
            gr.update(value=""),
            gr.update(value=""),
            gr.update(value="", visible=False)
        )

    if not evt.value or table_data is None:
        return _reset()

    position = evt.index
    if isinstance(position, (tuple, list)) and len(position) >= 2:
        row_index = position[0]
    else:
        row_index = position

    if not isinstance(row_index, int) or row_index >= len(table_data):
        return _reset()

    row = table_data.iloc[row_index]
    speaker_id = row.iloc[0]  # First column is the ID

    gender_code = row.iloc[1]
    if gender_code == "M":
        gender_full = "Male"
    elif gender_code == "F":
        gender_full = "Female"
    else:
        gender_full = gender_code
    selection_text = f"Selected Speaker: {speaker_id}\n{gender_full} • {row.iloc[2]} • {row.iloc[3]}"

    freeform_rows = get_freeform_table(speaker_id)
    emotion_rows = get_emotions_table(speaker_id)

    return (
        gr.update(value=selection_text, visible=True),  # Show speaker selection
        gr.update(value=freeform_rows, visible=True),  # Update freeform table
        gr.update(value=emotion_rows, visible=True),  # Update emotions table
        gr.update(value=speaker_id),  # Store speaker ID
        gr.update(value=None),  # Clear audio preview
        gr.update(value=""),  # Clear safetensors path
        gr.update(value=""),  # Clear audio path
        gr.update(value="", visible=False)  # Clear voice selection display
    )


def select_freeform_from_table(evt: gr.SelectData, speaker_id: str):
    """Load the freeform voice of the current speaker after a table click.

    Args:
        evt: Gradio selection event (not inspected; any click selects the
            speaker's freeform voice).
        speaker_id: ID of the currently selected speaker.

    Returns:
        Four updates, in order: selection label, audio player, safetensors
        path, audio path. All four are cleared when no speaker is set or the
        ``<speaker_id>_freeform`` directory does not exist under EARS_PATH.
    """
    freeform_dir = EARS_PATH / f"{speaker_id}_freeform" if speaker_id else None

    # Nothing to load: no speaker chosen, or the voice directory is absent.
    if freeform_dir is None or not freeform_dir.exists():
        return gr.update(value="", visible=False), gr.update(value=None), gr.update(value=""), gr.update(value="")

    mp3_file = str(freeform_dir / "audio.mp3")
    latent_file = str(freeform_dir / "speaker_latent.safetensors")
    label = f"Selected: Freeform\n{speaker_id}_freeform"

    return (
        gr.update(value=label, visible=True),  # freeform selection label
        gr.update(value=mp3_file),             # audio player
        gr.update(value=latent_file),          # safetensors path
        gr.update(value=mp3_file),             # audio path for reconstruction
    )


def select_emotion_from_table(evt: gr.SelectData, speaker_id: str):
    """Handle emotion selection - load the matching voice files.

    Args:
        evt: Gradio selection event; ``evt.index`` is normally ``(row, col)``.
        speaker_id: ID of the currently selected speaker.

    Returns:
        Four updates, in order: selection label, audio player, safetensors
        path, audio path. All four are cleared when nothing valid was clicked
        or the ``<speaker_id>_emo_<emotion>`` directory is missing under
        EARS_PATH.
    """
    if evt.value and speaker_id:
        # evt.index is (row, col) - get the row to extract emotion from first column
        if isinstance(evt.index, (tuple, list)) and len(evt.index) >= 2:
            row_index = evt.index[0]
        else:
            # Same convention as the other table handlers: a bare index is the
            # row itself. Falling back to row 0 would silently pick the wrong
            # emotion.
            row_index = evt.index

        # Get emotions data and extract the emotion name from first column
        emotions_data = get_emotions_table(speaker_id)
        # The lower bound stops a negative index from wrapping to the end.
        if isinstance(row_index, int) and 0 <= row_index < len(emotions_data):
            emotion = emotions_data[row_index][0]  # First column is emotion name

            voice_name = f"{speaker_id}_emo_{emotion}"
            voice_dir = EARS_PATH / voice_name
            audio_path = str(voice_dir / "audio.mp3")
            st_path = str(voice_dir / "speaker_latent.safetensors")

            if voice_dir.exists():
                # Format emotion display - clean and simple
                emotion_display = f"Selected Emotion: {emotion.title()}\n{speaker_id}_emo_{emotion}"

                return (
                    gr.update(value=emotion_display, visible=True),  # Show emotion selection
                    gr.update(value=audio_path),  # Update audio player
                    gr.update(value=st_path),      # Update safetensors path
                    gr.update(value=audio_path)    # Update audio path for reconstruction
                )
    return gr.update(value="", visible=False), gr.update(value=None), gr.update(value=""), gr.update(value="")


def select_vctk_speaker_from_table(evt: gr.SelectData, table_data):
    """Load a VCTK voice when a row of the (possibly filtered) table is clicked.

    Args:
        evt: Gradio selection event; ``evt.index`` is ``(row, col)`` or a row.
        table_data: Currently displayed table, indexed with ``.iloc``
            (a pandas DataFrame).

    Returns:
        Five updates, in order: selection label, stored speaker ID, audio
        player, safetensors path, audio path. All are cleared when the click
        cannot be resolved to an existing directory under VCTK_PATH.
    """
    def _no_selection():
        return (
            gr.update(value="", visible=False),
            gr.update(value=""),
            gr.update(value=None),
            gr.update(value=""),
            gr.update(value=""),
        )

    if not evt.value or table_data is None:
        return _no_selection()

    # Reduce a (row, col) pair to its row; anything else is used as-is.
    clicked = evt.index
    if isinstance(clicked, (tuple, list)) and len(clicked) >= 2:
        clicked = clicked[0]
    if not isinstance(clicked, int) or clicked >= len(table_data):
        return _no_selection()

    record = table_data.iloc[clicked]
    speaker_id = record.iloc[0]  # first column holds the speaker ID

    vctk_dir = VCTK_PATH / speaker_id
    mp3_file = str(vctk_dir / "audio.mp3")
    latent_file = str(vctk_dir / "speaker_latent.safetensors")
    if not vctk_dir.exists():
        return _no_selection()

    # Expand the one-letter gender code; unknown codes are shown unchanged.
    gender_code = record.iloc[1]
    if gender_code == "M":
        gender_full = "Male"
    elif gender_code == "F":
        gender_full = "Female"
    else:
        gender_full = gender_code
    selection_text = f"Selected Speaker: {speaker_id}\n{gender_full} • {record.iloc[2]} • {record.iloc[3]}"

    return (
        gr.update(value=selection_text, visible=True),  # selection label
        gr.update(value=speaker_id),                    # stored speaker ID
        gr.update(value=mp3_file),                      # audio player
        gr.update(value=latent_file),                   # safetensors path
        gr.update(value=mp3_file),                      # audio path for reconstruction
    )


def search_vctk_speakers(search_query):
    """Return a table update holding the VCTK speakers that match the query."""
    return gr.update(value=get_vctk_speakers_table(search_query))


# Expresso Helper Functions

def get_expresso_speakers():
    """Collect metadata for every usable Expresso voice directory.

    A directory under EXPRESSO_PATH is used when its name starts with
    ``expresso_`` and it contains ``audio.mp3``, ``speaker_latent.safetensors``
    and ``metadata.json``. Directories whose metadata cannot be read or
    formatted are skipped.

    Returns:
        List of dicts with keys ``id``, ``type``, ``speakers``, ``style`` and
        ``audio_length`` (seconds formatted like ``"12.3s"``), in sorted
        directory order. Empty when EXPRESSO_PATH does not exist.
    """
    if not EXPRESSO_PATH.exists():
        return []

    collected = []
    for entry in sorted(EXPRESSO_PATH.iterdir()):
        if not (entry.is_dir() and entry.name.startswith('expresso_')):
            continue

        meta_file = entry / "metadata.json"
        required = (entry / "audio.mp3", entry / "speaker_latent.safetensors", meta_file)
        if not all(path.exists() for path in required):
            continue

        try:
            with open(meta_file, 'r') as fh:
                meta = json.load(fh)
            duration = meta.get("audio_length_seconds", 0)
            record = {
                'id': entry.name,
                'type': meta.get('type', 'unknown'),
                'speakers': meta.get('speakers', 'unknown'),
                'style': meta.get('style', 'unknown'),
                'audio_length': f"{duration:.1f}s"
            }
        except Exception:
            # Best effort: an unreadable or malformed entry is left out.
            continue
        collected.append(record)

    return collected


def get_expresso_speakers_table(search_query=""):
    """Build Gradio table rows for the Expresso voices.

    Args:
        search_query: Optional case-insensitive substring; a voice is kept
            when it occurs in its id, type, speakers or style.

    Returns:
        List of ``[id, type, speakers, style, audio_length]`` rows.
    """
    searchable = ('id', 'type', 'speakers', 'style')
    columns = searchable + ('audio_length',)

    def _matches(voice):
        # An empty query keeps every voice.
        if not search_query:
            return True
        needle = search_query.lower()
        return any(needle in str(voice[field]).lower() for field in searchable)

    return [
        [voice[column] for column in columns]
        for voice in get_expresso_speakers()
        if _matches(voice)
    ]


def select_expresso_speaker_from_table(evt: gr.SelectData, table_data):
    """Load an Expresso voice when a row of the displayed table is clicked.

    Args:
        evt: Gradio selection event; ``evt.index`` is ``(row, col)`` or a row.
        table_data: Currently displayed (filtered) table, indexed with
            ``.iloc`` (a pandas DataFrame).

    Returns:
        Five updates, in order: selection label, stored speaker ID, audio
        player, safetensors path, audio path. All are cleared when the click
        cannot be resolved to an existing directory under EXPRESSO_PATH.
    """
    def _no_selection():
        return (
            gr.update(value="", visible=False),
            gr.update(value=""),
            gr.update(value=None),
            gr.update(value=""),
            gr.update(value=""),
        )

    if not evt.value or table_data is None:
        return _no_selection()

    # A (row, col) pair is reduced to its row; other values pass through.
    position = evt.index
    if isinstance(position, (tuple, list)) and len(position) >= 2:
        position = position[0]
    if not isinstance(position, int) or position >= len(table_data):
        return _no_selection()

    record = table_data.iloc[position]
    speaker_id = record.iloc[0]  # first column holds the voice ID

    expresso_dir = EXPRESSO_PATH / speaker_id
    mp3_file = str(expresso_dir / "audio.mp3")
    latent_file = str(expresso_dir / "speaker_latent.safetensors")
    if not expresso_dir.exists():
        return _no_selection()

    selection_text = f"Selected Voice: {speaker_id}\nType: {record.iloc[1]} • Speakers: {record.iloc[2]} • Style: {record.iloc[3]}"

    return (
        gr.update(value=selection_text, visible=True),  # selection label
        gr.update(value=speaker_id),                    # stored speaker ID
        gr.update(value=mp3_file),                      # audio player
        gr.update(value=latent_file),                   # safetensors path
        gr.update(value=mp3_file),                      # audio path for reconstruction
    )


def search_expresso_speakers(search_query):
    """Return a table update holding the Expresso voices that match the query."""
    return gr.update(value=get_expresso_speakers_table(search_query))


# HF-Custom Helper Functions

def get_hf_custom_speakers():
    """Collect metadata for every usable HF-Custom voice directory.

    A directory under HF_CUSTOM_PATH is used when it contains ``audio.mp3``,
    ``speaker_latent.safetensors`` and ``metadata.json``. Directories whose
    metadata cannot be read or formatted are skipped.

    Returns:
        List of dicts with keys ``name`` (metadata ``speaker_name``, falling
        back to the directory name), ``dataset``, ``description`` and
        ``audio_length`` (seconds formatted like ``"12.3s"``), in sorted
        directory order. Empty when HF_CUSTOM_PATH does not exist.
    """
    if not HF_CUSTOM_PATH.exists():
        return []

    collected = []
    for entry in sorted(HF_CUSTOM_PATH.iterdir()):
        if not entry.is_dir():
            continue

        meta_file = entry / "metadata.json"
        required = (entry / "audio.mp3", entry / "speaker_latent.safetensors", meta_file)
        if not all(path.exists() for path in required):
            continue

        try:
            with open(meta_file, 'r') as fh:
                meta = json.load(fh)
            duration = meta.get("audio_duration_seconds", 0)
            record = {
                'name': meta.get('speaker_name', entry.name),
                'dataset': meta.get('dataset_name', ''),
                'description': meta.get('speaker_description', ''),
                'audio_length': f"{duration:.1f}s"
            }
        except Exception:
            # Best effort: an unreadable or malformed entry is left out.
            continue
        collected.append(record)

    return collected


def get_hf_custom_speakers_table(search_query=""):
    """Build Gradio table rows for the HF-Custom voices.

    Args:
        search_query: Optional case-insensitive substring; a voice is kept
            when it occurs in its name, dataset or description.

    Returns:
        List of ``[name, dataset, description, audio_length]`` rows.
    """
    searchable = ('name', 'dataset', 'description')
    columns = searchable + ('audio_length',)

    def _matches(voice):
        # An empty query keeps every voice.
        if not search_query:
            return True
        needle = search_query.lower()
        return any(needle in str(voice[field]).lower() for field in searchable)

    return [
        [voice[column] for column in columns]
        for voice in get_hf_custom_speakers()
        if _matches(voice)
    ]


def _find_hf_custom_voice_dir(display_name):
    """Return the HF-Custom directory for a displayed speaker name, or None.

    The table shows the metadata ``speaker_name`` when one is present, and
    that need not equal the directory name. The directory named exactly like
    the displayed name is tried first (the original behavior); otherwise the
    complete voice directories are scanned for a matching ``speaker_name``.
    """
    direct = HF_CUSTOM_PATH / display_name
    if direct.exists():
        return direct
    if not HF_CUSTOM_PATH.exists():
        return None

    for subdir in sorted(HF_CUSTOM_PATH.iterdir()):
        metadata_path = subdir / "metadata.json"
        # Only consider directories the speaker listing would also show.
        if not (
            subdir.is_dir()
            and (subdir / "audio.mp3").exists()
            and (subdir / "speaker_latent.safetensors").exists()
            and metadata_path.exists()
        ):
            continue
        try:
            with open(metadata_path, 'r') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable metadata: this directory cannot be the match.
            continue
        if isinstance(data, dict) and data.get('speaker_name') == display_name:
            return subdir
    return None


def select_hf_custom_speaker_from_table(evt: gr.SelectData, table_data):
    """Handle HF-Custom speaker selection - load voice files directly.

    Args:
        evt: Gradio selection event; ``evt.index`` is ``(row, col)`` or a row.
        table_data: Currently displayed (filtered) table, indexed with
            ``.iloc`` (a pandas DataFrame).

    Returns:
        Five updates, in order: selection label, stored speaker name, audio
        player, safetensors path, audio path. All are cleared when the click
        cannot be resolved to an existing voice directory.
    """
    if evt.value and table_data is not None:
        # evt.index is a tuple/list (row, col), we need the row to get the speaker name
        if isinstance(evt.index, (tuple, list)) and len(evt.index) >= 2:
            row_index = evt.index[0]
        else:
            row_index = evt.index

        # Use the actual displayed (filtered) table data (pandas DataFrame)
        if isinstance(row_index, int) and row_index < len(table_data):
            speaker_row = table_data.iloc[row_index]
            speaker_name = speaker_row.iloc[0]  # First column is the name

            # The displayed name may come from metadata rather than from the
            # directory name, so resolve the directory instead of assuming
            # HF_CUSTOM_PATH / speaker_name.
            voice_dir = _find_hf_custom_voice_dir(speaker_name)

            if voice_dir is not None:
                audio_path = str(voice_dir / "audio.mp3")
                st_path = str(voice_dir / "speaker_latent.safetensors")

                # Format selection display
                dataset_info = f" • {speaker_row.iloc[1]}" if speaker_row.iloc[1] else ""
                selection_text = f"Selected Voice: {speaker_name}{dataset_info}\n{speaker_row.iloc[2]}"

                return (
                    gr.update(value=selection_text, visible=True),  # Show speaker selection
                    gr.update(value=speaker_name),  # Store speaker name
                    gr.update(value=audio_path),  # Update audio player
                    gr.update(value=st_path),      # Update safetensors path
                    gr.update(value=audio_path)    # Update audio path for reconstruction
                )
    return (
        gr.update(value="", visible=False),
        gr.update(value=""),
        gr.update(value=None),
        gr.update(value=""),
        gr.update(value="")
    )


def search_hf_custom_speakers(search_query):
    """Return a table update holding the HF-Custom voices that match the query."""
    return gr.update(value=get_hf_custom_speakers_table(search_query))


# Audio Prompt Library functions
# File suffixes (lowercase, with leading dot) accepted as audio prompt files;
# get_audio_prompt_files() compares them against each file's lowercased suffix.
AUDIO_EXTS = {".wav", ".mp3", ".m4a", ".ogg", ".flac", ".webm", ".aac", ".opus"}

def get_audio_prompt_files():
    """List the audio files in the prompt folder as single-column table rows.

    Returns:
        ``[[filename], ...]`` sorted case-insensitively, containing every
        regular file in AUDIO_PROMPT_FOLDER whose lowercased suffix is in
        AUDIO_EXTS. Empty when the folder is unset or does not exist.
    """
    folder = AUDIO_PROMPT_FOLDER
    if folder is None or not folder.exists():
        return []

    names = [
        entry.name
        for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() in AUDIO_EXTS
    ]
    names.sort(key=str.lower)

    return [[name] for name in names]


def select_audio_prompt_file(evt: gr.SelectData):
    """Resolve a clicked prompt-library cell to a file path update.

    Returns an update carrying the path of ``evt.value`` inside
    AUDIO_PROMPT_FOLDER when that file exists; otherwise a no-op update.
    """
    if not evt.value or AUDIO_PROMPT_FOLDER is None:
        return gr.update()

    candidate = AUDIO_PROMPT_FOLDER / evt.value
    if not candidate.exists():
        return gr.update()
    return gr.update(value=str(candidate))


def switch_dataset(dataset_name):
    """Switch between Custom Audio Panel, EARS, VCTK, Expresso, and HF-Custom datasets.

    Args:
        dataset_name: Name of the dataset to show. Any name other than
            "Custom Audio Panel", "EARS", "VCTK" or "Expresso" is treated as
            HF-Custom.

    Returns:
        A 19-tuple of updates, in this fixed order: dataset_license_info,
        custom_audio_row, voicebank_row, voice_type_column, ears_column,
        vctk_column, expresso_column, hf_custom_column,
        selected_speaker_display, freeform_table, emotions_table,
        selected_voice_display, vctk_speaker_display,
        expresso_speaker_display, hf_custom_speaker_display,
        selected_speaker_state, audio_preview, speaker_st_path_state,
        speaker_audio_path_state. Every switch clears the current selection.
    """
    license_texts = {
        "EARS": "**EARS Dataset License:** Creative Commons Attribution 4.0 International ([CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/))",
        "VCTK": "**VCTK Dataset License:** Creative Commons Attribution 4.0 International ([CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/))",
        "Expresso": "**Expresso Dataset License:** Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International ([CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/))",
        "HF-Custom": "**HF-Custom Voices:** Available in dataset cache (information in metadata.json per voice). Also view dataset at [jordand/echo-embeddings-custom](https://huggingface.co/datasets/jordand/echo-embeddings-custom)",
    }

    is_custom = dataset_name == "Custom Audio Panel"
    if is_custom or dataset_name in ("EARS", "VCTK", "Expresso"):
        active = dataset_name
    else:
        # Unrecognized names fall through to HF-Custom.
        active = "HF-Custom"

    if is_custom:
        # The custom panel hides the license line and the speaker label, and
        # empties the EARS tables without touching their visibility.
        license_update = gr.update(value="", visible=False)
        speaker_display_update = gr.update(value="", visible=False)
        freeform_update = gr.update(value=[])
        emotions_update = gr.update(value=[])
    else:
        license_update = gr.update(value=license_texts[active], visible=True)
        speaker_display_update = gr.update(value="")
        freeform_update = gr.update(value=[], visible=True)
        emotions_update = gr.update(value=[], visible=True)

    return (
        license_update,                                   # dataset_license_info
        gr.update(visible=is_custom),                     # custom_audio_row
        gr.update(visible=not is_custom),                 # voicebank_row
        gr.update(visible=active == "EARS"),              # voice_type_column (EARS only)
        # ears_column sits inside voicebank_row; it is left visible for the
        # custom panel, where the enclosing row is hidden anyway.
        gr.update(visible=is_custom or active == "EARS"),  # ears_column
        gr.update(visible=active == "VCTK"),              # vctk_column
        gr.update(visible=active == "Expresso"),          # expresso_column
        gr.update(visible=active == "HF-Custom"),         # hf_custom_column
        # Clear selections
        speaker_display_update,                           # selected_speaker_display
        freeform_update,                                  # freeform_table
        emotions_update,                                  # emotions_table
        gr.update(value="", visible=False),               # selected_voice_display
        gr.update(value="", visible=False),               # vctk_speaker_display
        gr.update(value="", visible=False),               # expresso_speaker_display
        gr.update(value="", visible=False),               # hf_custom_speaker_display
        gr.update(value=""),                              # selected_speaker_state
        gr.update(value=None),                            # audio_preview
        gr.update(value=""),                              # speaker_st_path_state
        gr.update(value=""),                              # speaker_audio_path_state
    )


def select_text_preset(evt: gr.SelectData):
    """Return the text of the preset whose table row was clicked.

    The row is taken from ``evt.index`` (``(row, col)`` or a bare row) and
    looked up in ``load_text_presets()``; the preset text is column 2 of that
    row. A no-op update is returned when the click cannot be resolved.
    """
    if not evt.value:
        return gr.update()

    position = evt.index
    if isinstance(position, (tuple, list)) and len(position) >= 2:
        position = position[0]

    rows = load_text_presets()
    if isinstance(position, int) and position < len(rows):
        return gr.update(value=rows[position][2])  # column 2 is the text
    return gr.update()


def update_cfg_visibility(cfg_mode):
    """Adapt the CFG controls to the selected guidance mode.

    Returns three updates: the relabelled first CFG scale field, then two
    visibility toggles. The first toggle is off only for
    "joint-unconditional"; the second is on only for "apg-independent".
    """
    joint = cfg_mode == "joint-unconditional"

    if joint:
        scale_update = gr.update(label="Text/Speaker CFG Scale", info="Guidance strength for text and speaker (joint)")
    else:
        # apg-independent, independent and alternating share the text label.
        scale_update = gr.update(label="Text CFG Scale", info="Guidance strength for text")

    return (
        scale_update,
        gr.update(visible=not joint),
        gr.update(visible=cfg_mode == "apg-independent"),
    )


def toggle_speaker_k_fields(enabled):
    """Show or hide the speaker K row; only visibility is changed, no values."""
    row_update = gr.update(visible=enabled)
    return row_update


def toggle_custom_shapes_fields(enabled):
    """Show the custom shapes row when enabled, hide it otherwise.

    Only the row's visibility is updated; no field values are reset here.

    Args:
        enabled: Truthy to show the row, falsy to hide it.

    Returns:
        A single visibility update for the custom shapes row.
    """
    return gr.update(visible=bool(enabled))


def toggle_mode(mode, speaker_k_enable_val, speaker_kv_simple_val):
    """Switch between simple and advanced layouts and align both speaker KV flags.

    In "Simple Mode" the advanced flag (``speaker_k_enable_val``) is copied to
    both checkboxes; in any other mode the simple flag
    (``speaker_kv_simple_val``) is copied to both.

    Returns:
        Five updates, in order: simple_mode_row visibility,
        advanced_mode_compile_column visibility, advanced_mode_column
        visibility, simple checkbox value, speaker_k_enable value.
    """
    simple = mode == "Simple Mode"
    # The flag of the mode being left is the source of truth.
    synced_flag = speaker_k_enable_val if simple else speaker_kv_simple_val

    return (
        gr.update(visible=simple),      # simple_mode_row (speaker KV checkbox)
        gr.update(visible=not simple),  # advanced_mode_compile_column
        gr.update(visible=not simple),  # advanced_mode_column
        gr.update(value=synced_flag),   # simple checkbox
        gr.update(value=synced_flag),   # speaker_k_enable
    )


def sync_simple_to_advanced(simple_enabled):
    """Mirror the simple-mode speaker KV checkbox onto the advanced controls.

    Returns five updates, in order: speaker_k_enable, speaker_k_row,
    speaker_k_scale, speaker_k_min_t, speaker_k_max_layers. Enabling sets the
    three numeric fields to 1.5, 0.9 and 24; disabling leaves them untouched.
    """
    if not simple_enabled:
        return (
            gr.update(value=False),    # speaker_k_enable
            gr.update(visible=False),  # speaker_k_row
            gr.update(),               # speaker_k_scale (no change)
            gr.update(),               # speaker_k_min_t (no change)
            gr.update(),               # speaker_k_max_layers (no change)
        )

    # speaker_k_scale, speaker_k_min_t, speaker_k_max_layers
    enabled_values = (1.5, 0.9, 24)
    return (
        gr.update(value=True),    # speaker_k_enable
        gr.update(visible=True),  # speaker_k_row
        *(gr.update(value=setting) for setting in enabled_values),
    )


def apply_core_preset(preset_name):
    """Apply a core sampling preset.

    Only "default" is known; it yields updates for rng_seed (0), num_steps
    (40), cfg_mode ("independent") and switches the main preset to "Custom".
    Any other name yields four no-op updates.
    """
    if preset_name != "default":
        return [gr.update()] * 4

    # rng_seed, num_steps, cfg_mode, main preset selector
    core_values = (0, 40, "independent", "Custom")
    return [gr.update(value=setting) for setting in core_values]


def apply_cfg_preset(preset_name):
    """Apply a CFG guidance preset.

    Returns five updates, in order: cfg_scale_text, cfg_scale_speaker,
    cfg_min_t, cfg_max_t, and the main preset selector (set to "Custom").
    An unknown preset name yields five no-op updates.
    """
    # name -> (cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t)
    cfg_presets = {
        "default": (3.0, 5.0, 0.5, 1.0),
        "higher speaker": (3.0, 8.0, 0.5, 1.0),
        "large guidances": (8.0, 8.0, 0.5, 1.0),
    }

    chosen = cfg_presets.get(preset_name)
    if chosen is None:
        return [gr.update()] * 5

    updates = [gr.update(value=setting) for setting in chosen]
    updates.append(gr.update(value="Custom"))  # main preset becomes Custom
    return updates


def apply_speaker_kv_preset(preset_name):
    """Apply a speaker KV attention preset ("enable" or "off").

    Returns three updates, in order: speaker_k_enable value, speaker_k_row
    visibility, and the main preset selector (set to "Custom"). Any other
    preset name yields three no-op updates.
    """
    if preset_name not in ("enable", "off"):
        return [gr.update()] * 3

    turned_on = preset_name == "enable"
    return [
        gr.update(value=turned_on),    # speaker_k_enable
        gr.update(visible=turned_on),  # speaker_k_row
        gr.update(value="Custom"),     # main preset becomes Custom
    ]


def apply_truncation_preset(preset_name):
    """Apply a truncation & temporal rescaling preset.

    Returns four updates, in order: truncation factor, rescale k, rescale
    sigma, and the main preset selector (set to "Custom"). "custom" or an
    unknown name yields four no-op updates.
    """
    # name -> (truncation, rescale_k, rescale_sigma)
    truncation_presets = {
        "flat": (0.8, 1.2, 3.0),
        "sharp": (0.9, 0.96, 3.0),
        "baseline(sharp)": (1.0, 1.0, 3.0),
    }

    chosen = None if preset_name == "custom" else truncation_presets.get(preset_name)
    if chosen is None:
        return [gr.update()] * 4  # nothing to change

    updates = [gr.update(value=setting) for setting in chosen]
    updates.append(gr.update(value="Custom"))  # main preset becomes Custom
    return updates


def apply_apg_preset(preset_name):
    """Apply an APG parameters preset.

    Returns seven updates, in order: apg_eta_text, apg_eta_speaker,
    apg_momentum_text, apg_momentum_speaker, apg_norm_text, apg_norm_speaker,
    and the main preset selector (set to "Custom"). An unknown preset name
    yields seven no-op updates.
    """
    # name -> (eta_text, eta_speaker, momentum_text, momentum_speaker,
    #          norm_text, norm_speaker); norms are strings, "" meaning unset.
    apg_presets = {
        "default": (0.5, 0.5, -0.25, -0.25, "", ""),
        "no momentum": (0.0, 0.0, 0.0, 0.0, "", ""),
        "norms": (0.5, 0.5, -0.25, -0.25, "7.5", "7.5"),
        "no eta": (0.0, 0.0, -0.25, -0.25, "", ""),
    }

    chosen = apg_presets.get(preset_name)
    if chosen is None:
        return [gr.update()] * 7

    updates = [gr.update(value=setting) for setting in chosen]
    updates.append(gr.update(value="Custom"))  # main preset becomes Custom
    return updates


def load_sampler_presets():
    """Load sampler presets from the JSON file at SAMPLER_PRESETS_PATH.

    When the file exists its parsed content is returned as-is. Otherwise a
    built-in set of default presets is returned and, on a best-effort basis,
    written to SAMPLER_PRESETS_PATH so it can be edited later.

    Returns:
        Dict mapping preset name to a dict of parameter name -> value
        (the built-in defaults store values as strings).

    Raises:
        json.JSONDecodeError: If an existing presets file is not valid JSON.
        OSError: If an existing presets file cannot be read.
    """
    if SAMPLER_PRESETS_PATH.exists():
        with open(SAMPLER_PRESETS_PATH, 'r', encoding='utf-8') as f:
            return json.load(f)

    # No presets file yet: fall back to the built-in defaults.
    default_presets = {
        "Flat (Independent)": {
            "num_steps": "30",
            "cfg_mode": "independent",
            "cfg_scale_text": "3.0",
            "cfg_scale_speaker": "5.0",
            "cfg_min_t": "0.5",
            "cfg_max_t": "1.0",
            "truncation_factor": "0.8",
            "rescale_k": "1.2",
            "rescale_sigma": "3.0"
        },
        "Sharp (Independent)": {
            "num_steps": "30",
            "cfg_mode": "independent",
            "cfg_scale_text": "3.0",
            "cfg_scale_speaker": "5.0",
            "cfg_min_t": "0.5",
            "cfg_max_t": "1.0",
            "truncation_factor": "0.9",
            "rescale_k": "0.96",
            "rescale_sigma": "3.0"
        },
    }
    try:
        with open(SAMPLER_PRESETS_PATH, 'w', encoding='utf-8') as f:
            json.dump(default_presets, f, indent=2)
    except OSError as exc:
        # Persisting is a convenience only; a read-only or missing directory
        # must not stop the app from using the in-memory defaults.
        logging.getLogger(__name__).warning(
            "Could not write default sampler presets to %s: %s",
            SAMPLER_PRESETS_PATH,
            exc,
        )
    return default_presets


def apply_sampler_preset(preset_name):
    """Apply a sampler preset to all sampler-related fields.

    Args:
        preset_name: Name of the preset to apply. "Custom" or a name that is
            not present in the loaded presets leaves every field untouched.

    Returns:
        list: Exactly 20 ``gr.update`` results, in this order: num_steps,
        cfg_mode, cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t,
        truncation_factor, rescale_k, rescale_sigma, speaker_k_enable,
        speaker_k_row (visibility only), speaker_k_scale, speaker_k_min_t,
        speaker_k_max_layers, apg_eta_text, apg_eta_speaker,
        apg_momentum_text, apg_momentum_speaker, apg_norm_text,
        apg_norm_speaker.
        NOTE(review): the order is taken from the preset keys and inline
        comments here; confirm it matches the outputs list of the event
        handler that calls this function.

    Raises:
        KeyError: If the preset lacks one of the required keys (num_steps,
            cfg_mode, cfg_scale_text, cfg_scale_speaker, cfg_min_t,
            cfg_max_t, truncation_factor, rescale_k, rescale_sigma).
    """
    presets = load_sampler_presets()
    if preset_name == "Custom" or preset_name not in presets:
        return [gr.update()] * 20  # Return no changes for custom

    preset = presets[preset_name]

    # The speaker CFG scale is hidden when cfg_mode is "joint-unconditional".
    # Visibility of the APG parameter group is not part of the returned
    # updates.
    cfg_mode_value = preset["cfg_mode"]
    speaker_visible = (cfg_mode_value != "joint-unconditional")

    speaker_k_enabled = preset.get("speaker_k_enable", False)

    def to_num(val, default):
        # Presets store numbers as strings. Unparseable strings and missing
        # values (JSON null) fall back to the default; values that are
        # already numeric pass through unchanged.
        if val is None:
            return default
        if not isinstance(val, str):
            return val
        try:
            return float(val)
        except ValueError:
            return default

    return [
        gr.update(value=int(to_num(preset["num_steps"], 40))),
        gr.update(value=cfg_mode_value),
        gr.update(value=to_num(preset["cfg_scale_text"], 3.0)),
        gr.update(value=to_num(preset["cfg_scale_speaker"], 5.0), visible=speaker_visible),
        gr.update(value=to_num(preset["cfg_min_t"], 0.5)),
        gr.update(value=to_num(preset["cfg_max_t"], 1.0)),
        gr.update(value=to_num(preset["truncation_factor"], 0.8)),
        gr.update(value=to_num(preset["rescale_k"], 1.2)),
        gr.update(value=to_num(preset["rescale_sigma"], 3.0)),
        gr.update(value=speaker_k_enabled),
        gr.update(visible=speaker_k_enabled),  # speaker_k_row
        gr.update(value=to_num(preset.get("speaker_k_scale", "1.5"), 1.5)),
        gr.update(value=to_num(preset.get("speaker_k_min_t", "0.9"), 0.9)),
        gr.update(value=int(to_num(preset.get("speaker_k_max_layers", "24"), 24))),
        gr.update(value=to_num(preset.get("apg_eta_text", "0.0"), 0.0)),
        gr.update(value=to_num(preset.get("apg_eta_speaker", "0.0"), 0.0)),
        gr.update(value=to_num(preset.get("apg_momentum_text", "0.0"), 0.0)),
        gr.update(value=to_num(preset.get("apg_momentum_speaker", "0.0"), 0.0)),
        gr.update(value=preset.get("apg_norm_text", "")),  # Keep as string (can be empty)
        gr.update(value=preset.get("apg_norm_speaker", "")),  # Keep as string (can be empty)
    ]


# Build Gradio Interface
# Custom stylesheet passed to gr.Blocks(css=...) below. Sections, in order:
#   - .preset-inline / a.preset-link: inline "title (link, link)" headers used
#     in the parameter groups, with dark-mode colour overrides.
#   - .proxy-btn: keeps the proxy buttons in the DOM but invisible and
#     non-interactive (they are clicked programmatically by JS_CODE).
#   - .gr-group: border/background contrast for parameter group boxes.
#   - .generated-audio-player: highlight frame around the generated audio.
#   - #component-mode-selector: styling of the Simple/Advanced mode radio.
#   - .section-separator: gradient <hr> between the main sections.
#   - .gradio-container h1/h2: section header weight and spacing.
#   - .tip-box: highlighted tip box used in the quick-start instructions.
LINK_CSS = """
.preset-inline { display:flex; align-items:baseline; gap:6px; margin-top:-4px; margin-bottom:-12px; }
.preset-inline .title { font-weight:600; font-size:.95rem; }
.preset-inline .dim   { color:#666; margin:0 4px; }
/* blue, linky */
a.preset-link { color: #0a5bd8; text-decoration: underline; cursor: pointer; font-weight: 400; }
a.preset-link:hover { text-decoration: none; opacity: 0.8; }

/* Dark mode support for preset links */
.dark a.preset-link,
[data-theme="dark"] a.preset-link {
    color: #60a5fa !important;
}
.dark a.preset-link:hover,
[data-theme="dark"] a.preset-link:hover {
    color: #93c5fd !important;
}
.dark .preset-inline .dim,
[data-theme="dark"] .preset-inline .dim {
    color: #9ca3af !important;
}

/* keep proxy buttons in DOM but invisible */
.proxy-btn { position:absolute; width:0; height:0; overflow:hidden; padding:0 !important; margin:0 !important; border:0 !important; opacity:0; pointer-events:none; }

/* Better contrast for parameter group boxes */
.gr-group {
    border: 1px solid #d1d5db !important;
    background: #f3f4f6 !important;
}
.dark .gr-group,
[data-theme="dark"] .gr-group {
    border: 1px solid #4b5563 !important;
    background: #1f2937 !important;
}

/* Highlight generated audio */
.generated-audio-player {
    border: 3px solid #667eea !important;
    border-radius: 12px !important;
    padding: 20px !important;
    background: linear-gradient(135deg, rgba(102, 126, 234, 0.08) 0%, rgba(118, 75, 162, 0.05) 100%) !important;
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.2) !important;
    margin: 1rem 0 !important;
}
.generated-audio-player > div {
    background: transparent !important;
}

/* Make Parameter Mode selector more prominent */
#component-mode-selector {
    text-align: center;
    padding: 1rem 0;
}
#component-mode-selector label {
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    margin-bottom: 0.5rem !important;
}
#component-mode-selector .wrap {
    justify-content: center !important;
}
#component-mode-selector fieldset {
    border: 2px solid #e5e7eb !important;
    border-radius: 8px !important;
    padding: 1rem !important;
    background: #f9fafb !important;
}
.dark #component-mode-selector fieldset,
[data-theme="dark"] #component-mode-selector fieldset {
    border: 2px solid #4b5563 !important;
    background: #1f2937 !important;
}

/* Stronger section separators */
.section-separator {
    height: 3px !important;
    background: linear-gradient(90deg, transparent 0%, #667eea 20%, #764ba2 80%, transparent 100%) !important;
    border: none !important;
    margin: 2rem 0 !important;
}
.dark .section-separator,
[data-theme="dark"] .section-separator {
    background: linear-gradient(90deg, transparent 0%, #667eea 20%, #764ba2 80%, transparent 100%) !important;
}

/* Section headers styling */
.gradio-container h1,
.gradio-container h2 {
    font-weight: 700 !important;
    margin-top: 1.5rem !important;
    margin-bottom: 1rem !important;
}

/* Highlighted tip box */
.tip-box {
    background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%) !important;
    border-left: 4px solid #f59e0b !important;
    border-radius: 8px !important;
    padding: 1rem 1.5rem !important;
    margin: 1rem 0 !important;
    box-shadow: 0 2px 4px rgba(245, 158, 11, 0.1) !important;
}
.tip-box strong {
    color: #92400e !important;
}
.dark .tip-box,
[data-theme="dark"] .tip-box {
    background: linear-gradient(135deg, #451a03 0%, #78350f 100%) !important;
    border-left: 4px solid #f59e0b !important;
}
.dark .tip-box strong,
[data-theme="dark"] .tip-box strong {
    color: #fbbf24 !important;
}
"""

# JavaScript passed to gr.Blocks(js=...) below. It installs one capturing
# click listener on the app root (the <gradio-app> shadow root when present,
# otherwise the document). A click on any <a class="preset-link"> is
# cancelled and forwarded to the element whose id equals the link's
# "data-fire" attribute: the first button (or [role='button']) inside that
# element is clicked, or the element itself if it contains none. This is how
# the inline preset links trigger the hidden "proxy-btn" buttons.
# Raw string so the JavaScript is passed through without Python escape
# processing.
JS_CODE = r"""
function () {
  // Get a queryable root, regardless of Shadow DOM
  const appEl = document.querySelector("gradio-app");
  const root  = appEl && appEl.shadowRoot ? appEl.shadowRoot : document;

  function clickHiddenButtonById(id) {
    if (!id) return;
    const host = root.getElementById(id);
    if (!host) return;
    const realBtn = host.querySelector("button, [role='button']") || host;
    realBtn.click();
  }

  // Delegate clicks from any <a class="preset-link" data-fire="...">
  root.addEventListener("click", (ev) => {
    const a = ev.target.closest("a.preset-link");
    if (!a) return;
    ev.preventDefault();
    ev.stopPropagation();
    ev.stopImmediatePropagation();
    clickHiddenButtonById(a.getAttribute("data-fire"));
    return false;
  }, true);
}
"""

def init_session():
    """Create the session ID used for one browser tab/session.

    Returns a random token of 16 hexadecimal characters (8 random bytes).
    """
    session_token = secrets.token_hex(nbytes=8)
    return session_token

def init_and_compile():
    """Create a session ID and, outside Zero GPU, compile on page load.

    Returns a 3-tuple ``(session_id, status_update, checkbox_update)``. On
    Zero GPU the two updates are no-op ``gr.update()`` values; otherwise they
    are whatever ``do_compile()`` returns.
    """
    new_session_id = secrets.token_hex(8)

    if IS_ZEROGPU:
        # Compilation is not attempted on Zero GPU; leave both components as-is.
        return new_session_id, gr.update(), gr.update()

    # Compile automatically so Simple mode (which defaults compile=True) gets
    # a compiled model. do_compile loads the models itself; its status output
    # is visible in Advanced mode and hidden in Simple mode.
    compile_status_update, compile_checkbox_update = do_compile()
    return new_session_id, compile_status_update, compile_checkbox_update

with gr.Blocks(title="Echo-TTS", css=LINK_CSS, js=JS_CODE) as demo:
    gr.Markdown("# Echo-TTS")
    gr.Markdown("*Jordan Darefsky, 2025. See technical details [here](https://jordandarefsky.com/blog/2025/echo/)*")
    
    # License notice for Fish Speech autoencoder
    gr.Markdown("**License Notice:** All audio outputs are subject to non-commercial use [CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).")
    
    # Silentcipher watermarking notice
    if USE_SILENTCIPHER:
        gr.Markdown(f"*Audio output is watermarked with [silentcipher](https://github.com/sony/silentcipher) using message `{SILENTCIPHER_MESSAGE}`*")
    
    # Instructions for Simple Mode
    with gr.Accordion("📖 Quick Start Instructions", open=True):
        gr.Markdown("""
        ### Simple Mode (Recommended for Beginners)
        
        1. **Pick or upload a voice** - Choose from the voicebank or upload your own audio (up to 2 minutes)
        2. **Choose a text prompt preset or enter your own prompt** - What you want the voice to say (the presets are a good guide for format/style)
        3. **Select a Sampling preset (optional) ** - The default preset "Independent (High Speaker CFG)" is usually good to start
        4. **Click Generate Audio** - Wait for the model to generate your audio
        
        <div class="tip-box">
        
        💡 **Tip:** If the generated voice doesn't match the reference speaker at all, enable "Speaker KV Attention Scaling" and click Generate Audio again.
        
        </div>
        
        ### Advanced Mode
        
        Switch to Advanced mode for full control over all generation parameters including CFG scales, sampling steps, truncation, and more.

        ### Other tips

        High CFG settings are recommended but may lead to oversaturation; APG might help with this. Flat settings tend to reduce "impulse" artifacts but might result in worse (blunted/compressed/artifact-y) laughter, breathing, etc. generation.

        Echo will try to fit the entire text-prompt into (<=) 30 seconds of audio. If your prompt is very long, the generated speech may be too quick (this is not an issue for shorter text-prompts). For disfluent, single-speaker speech, we recommend trying the reference text beginning with "[S1] ... explore how we can design" as a starting point.
        """)
    
    # Session state for per-user file management
    session_id_state = gr.State(None)
    
    # Hidden state variables to store paths and selection
    selected_speaker_state = gr.Textbox(visible=False, value="")
    speaker_st_path_state = gr.Textbox(visible=False, value="")
    speaker_audio_path_state = gr.Textbox(visible=False, value="")
    
    gr.Markdown("# Voice Selection")
    
    # Dataset selector
    dataset_selector = gr.Radio(
        choices=["Custom Audio Panel", "EARS", "VCTK", "Expresso", "HF-Custom"],
        value="Custom Audio Panel",
        label="Select Dataset",
        info="Choose which voicebank to use"
    )
    
    dataset_license_info = gr.Markdown(
        "",
        visible=False
    )
    
    # Custom Audio Panel UI (visible by default, takes full width)
    with gr.Row(visible=True) as custom_audio_row:
        # Optional: Audio prompt library table (only shown if AUDIO_PROMPT_FOLDER is configured)
        if AUDIO_PROMPT_FOLDER is not None and AUDIO_PROMPT_FOLDER.exists():
            with gr.Column(scale=1, min_width=200):
                gr.Markdown("#### Audio Library (favorite examples from voicebank datasets)")
                audio_prompt_table = gr.Dataframe(
                    value=get_audio_prompt_files(),
                    headers=["Filename"],
                    datatype=["str"],
                    row_count=(10, "dynamic"),
                    col_count=(1, "fixed"),
                    interactive=False,
                    label="Click to select (or upload your own audio file directly on the right)"
                )
        
        with gr.Column(scale=2):
            custom_audio_input = gr.Audio(
                sources=["upload", "microphone"], 
                type="filepath", 
                label="Speaker Reference Audio (only first two minutes will be used; leave empty for zero speaker conditioning)",
                max_length=600  # Maximum duration in seconds (10 minutes)
            )
    
    with gr.Row(visible=False) as voicebank_row:
        # Voice selection UI for all voicebank datasets
        
        # EARS UI (visible by default when voicebank_row is shown)
        with gr.Column(scale=2, visible=True) as ears_column:
            gr.Markdown("### 1. Speakers (EARS)")
            selected_speaker_display = gr.Textbox(
                value="",
                label="",
                show_label=False,
                interactive=False,
                visible=False,
                lines=2,
                max_lines=2
            )
            speaker_search = gr.Textbox(
                placeholder="Search speakers (by ID, gender, age, ethnicity, language)...",
                label="",
                show_label=False,
                container=False
            )
            speakers_table = gr.Dataframe(
                value=get_speakers_table(),
                headers=["ID", "G", "Age", "Ethnicity", "Native Lang"],
                datatype=["str", "str", "str", "str", "str"],
                row_count=(8, "dynamic"),
                col_count=(5, "fixed"),
                interactive=False,
                label="Click any cell to select",
                column_widths=["10%", "8%", "15%", "30%", "37%"]
            )
        
        # VCTK UI (hidden by default)
        with gr.Column(scale=2, visible=False) as vctk_column:
            gr.Markdown("### 1. Speakers (VCTK)")
            vctk_speaker_display = gr.Textbox(
                value="",
                label="",
                show_label=False,
                interactive=False,
                visible=False,
                lines=2,
                max_lines=2
            )
            vctk_speaker_search = gr.Textbox(
                placeholder="Search speakers (by ID, gender, age, details)...",
                label="",
                show_label=False,
                container=False
            )
            vctk_speakers_table = gr.Dataframe(
                value=get_vctk_speakers_table(),
                headers=["ID", "G", "Age", "Details", "Length"],
                datatype=["str", "str", "str", "str", "str"],
                row_count=(8, "dynamic"),
                col_count=(5, "fixed"),
                interactive=False,
                label="Click any cell to select",
                column_widths=["10%", "8%", "12%", "50%", "20%"]
            )
        
        # Expresso UI (hidden by default)
        with gr.Column(scale=2, visible=False) as expresso_column:
            gr.Markdown("### 1. Voices (Expresso)")
            expresso_speaker_display = gr.Textbox(
                value="",
                label="",
                show_label=False,
                interactive=False,
                visible=False,
                lines=2,
                max_lines=2
            )
            expresso_speaker_search = gr.Textbox(
                placeholder="Search voices (by ID, type, speakers, style)...",
                label="",
                show_label=False,
                container=False
            )
            expresso_speakers_table = gr.Dataframe(
                value=get_expresso_speakers_table(),
                headers=["ID", "Type", "Speakers", "Style", "Length"],
                datatype=["str", "str", "str", "str", "str"],
                row_count=(8, "dynamic"),
                col_count=(5, "fixed"),
                interactive=False,
                label="Click any cell to select",
                column_widths=["35%", "15%", "15%", "15%", "20%"]
            )
        
        # HF-Custom UI (hidden by default)
        with gr.Column(scale=2, visible=False) as hf_custom_column:
            gr.Markdown("### 1. Voices (HF-Custom)")
            hf_custom_speaker_display = gr.Textbox(
                value="",
                label="",
                show_label=False,
                interactive=False,
                visible=False,
                lines=2,
                max_lines=2
            )
            hf_custom_speaker_search = gr.Textbox(
                placeholder="Search voices (by name, dataset, description)...",
                label="",
                show_label=False,
                container=False
            )
            hf_custom_speakers_table = gr.Dataframe(
                value=get_hf_custom_speakers_table(),
                headers=["Name", "Dataset", "Description", "Length"],
                datatype=["str", "str", "str", "str"],
                row_count=(8, "dynamic"),
                col_count=(4, "fixed"),
                interactive=False,
                label="Click any cell to select",
                column_widths=["15%", "15%", "50%", "20%"]
            )
        
        with gr.Column(scale=1, visible=True) as voice_type_column:
            gr.Markdown("### 2. Voice Type")
            selected_voice_display = gr.Textbox(
                value="",
                label="",
                show_label=False,
                interactive=False,
                visible=False,
                lines=2,
                max_lines=2
            )
            freeform_table = gr.Dataframe(
                value=[],
                headers=["Type", "Length"],
                datatype=["str", "str"],
                row_count=(1, "fixed"),
                col_count=(2, "fixed"),
                interactive=False,
                label="Freeform voice",
                visible=True,
                column_widths=["60%", "40%"]
            )
            gr.Markdown("**Emotions:**")
            emotions_table = gr.Dataframe(
                value=[],
                headers=["Emotion", "Length"],
                datatype=["str", "str"],
                row_count=(8, "dynamic"),
                col_count=(2, "fixed"),
                interactive=False,
                visible=True,
                column_widths=["60%", "40%"]
            )
        
        with gr.Column(scale=1):
            gr.Markdown("### 3. Audio Preview")
            audio_preview = gr.Audio(label="Voice Sample", type="filepath", interactive=False)

    gr.HTML('<hr class="section-separator">')
    gr.Markdown("# Text Prompt")
    with gr.Accordion("Text Presets", open=True):
        text_presets_table = gr.Dataframe(
            value=load_text_presets(),
            headers=["Category", "Words", "Preset Text"],
            datatype=["str", "str", "str"],
            row_count=(3, "dynamic"),
            col_count=(3, "fixed"),
            interactive=False,
            column_widths=["12%", "6%", "82%"]
        )
    text_prompt = gr.Textbox(
        label="Text Prompt", 
        placeholder="[S1] Enter your text prompt here...", 
        lines=4
    )
    
    gr.HTML('<hr class="section-separator">')
    gr.Markdown("# Generation")
    
    # Mode selector: Simple or Advanced (outside the accordion, centered and prominent)
    with gr.Row():
        with gr.Column(scale=1):
            pass  # Empty column for spacing
        with gr.Column(scale=2):
            mode_selector = gr.Radio(
                choices=["Simple Mode", "Advanced Mode"],
                value="Simple Mode",
                label="",
                info=None,
                elem_id="component-mode-selector"
            )
        with gr.Column(scale=1):
            pass  # Empty column for spacing
    
    with gr.Accordion("⚙️ Generation Parameters", open=True):
        
        with gr.Row():
            presets = load_sampler_presets()
            preset_keys = list(presets.keys())
            first_preset = preset_keys[0] if preset_keys else "Custom"
            
            preset_dropdown = gr.Dropdown(
                choices=["Custom"] + preset_keys,
                value=first_preset,  # Default to first preset instead of Custom
                label="Sampler Preset",
                info="Load preset configurations",
                scale=2
            )
            
            rng_seed = gr.Number(
                label="RNG Seed", 
                value=0, 
                info="Random seed for starting noise", 
                precision=0,
                scale=1
            )
            
            # Simple mode: Speaker KV checkbox on same row (visible by default)
            with gr.Column(scale=1, visible=True) as simple_mode_row:
                speaker_kv_simple_checkbox = gr.Checkbox(
                    label="\"Force Speaker\" (Enable Speaker KV Attention Scaling)",
                    value=False,
                    info="Enable if generation does not match reference voice (otherwise leave off)"
                )
            
            # Advanced mode: Compile and custom shapes checkboxes (hidden by default)
            with gr.Column(scale=1, visible=False) as advanced_mode_compile_column:
                compile_checkbox = gr.Checkbox(
                    label="Compile Model", 
                    value=True,  # Default to True in simple mode
                    interactive=not IS_ZEROGPU,
                    info="Compile disabled on Zero GPU" if IS_ZEROGPU else "~20-30% faster after initial compilation"
                )
                compile_status = gr.Markdown(
                    value="⚠️ Compile disabled on Zero GPU" if IS_ZEROGPU else "",
                    visible=IS_ZEROGPU
                )
                use_custom_shapes_checkbox = gr.Checkbox(
                    label="Use Custom Shapes (Advanced)",
                    value=False,
                    info="Override default sequence lengths for text, speaker, and sample"
                )
        
        # Advanced mode controls (hidden by default)
        with gr.Column(visible=False) as advanced_mode_column:
            with gr.Row(visible=False) as custom_shapes_row:
                max_text_byte_length = gr.Textbox(
                    label="Max Text Byte Length (padded)",
                    value="768",
                    info="Maximum text utf-8 byte sequence length (blank -> no padding)",
                    scale=1
                )
                max_speaker_latent_length = gr.Textbox(
                    label="Max Speaker Latent Length (padded)",
                    value="2560",
                    info="Maximum (unpatched)speaker latent length (blank -> no padding), default 2560 = ~30s",
                    scale=1
                )
                sample_latent_len = gr.Textbox(
                    label="Sample Latent Length",
                    value="640",
                    info="Maximum sample latent length (EXPERIMENTAL!!! ONLY TRAINED WITH 640 BUT SOMEHOW WORKS WITH < 640 TO GENERATE PREFIXES)",
                    scale=1
                )
            
            
            with gr.Row():
                # Left column: Core Sampling Parameters
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.HTML("""
                        <div class="preset-inline">
                          <span class="title">Core Sampling Parameters</span><span class="dim">(</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="core_default">default</a>
                          <span class="dim">)</span>
                        </div>
                        """)
                        core_preset_default = gr.Button("", elem_id="core_default", elem_classes=["proxy-btn"])
                        num_steps = gr.Number(label="Number of Steps", value=40, info="Number of sampling steps (consider 20 - 80) (capped at 80)", precision=0, minimum=1, step=5, maximum=80)
                        
                        cfg_mode = gr.Radio(
                            choices=[
                                "independent",
                                "apg-independent",
                                "alternating",
                                "joint-unconditional"
                            ],
                            value="independent",
                            label="CFG Mode",
                            info="Independent (3 NFE), Adaptive Projected Guidance (3 NFE, see https://arxiv.org/abs/2410.02416), Alternating (2 NFE), Joint-Unconditional (2 NFE)"
                        )
                    
                    with gr.Group():
                        gr.HTML("""
                        <div class="preset-inline">
                          <span class="title">CFG Guidance</span><span class="dim">(</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="cfg_default">default</a>
                          <span class="dim">,</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="cfg_higher">higher speaker</a>
                          <span class="dim">,</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="cfg_large">large guidances(works with apg)</a>
                          <span class="dim">)</span>
                        </div>
                        """)
                        cfg_preset_default = gr.Button("", elem_id="cfg_default", elem_classes=["proxy-btn"])
                        cfg_preset_higher_speaker = gr.Button("", elem_id="cfg_higher", elem_classes=["proxy-btn"])
                        cfg_preset_large_guidances = gr.Button("", elem_id="cfg_large", elem_classes=["proxy-btn"])
                        with gr.Row():
                            cfg_scale_text = gr.Number(label="Text CFG Scale", value=3.0, info="Guidance strength for text", minimum=0, step=0.5)
                            cfg_scale_speaker = gr.Number(label="Speaker CFG Scale", value=5.0, info="Guidance strength for speaker", minimum=0, step=0.5)
                        
                        with gr.Row():
                            cfg_min_t = gr.Number(label="CFG Min t", value=0.5, info="(0-1), CFG applied when t >= val", minimum=0, maximum=1, step=0.05)
                            cfg_max_t = gr.Number(label="CFG Max t", value=1.0, info="(0-1), CFG applied when t <= val", minimum=0, maximum=1, step=0.05)
                
                # Right column: Speaker KV, Truncation + APG
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.HTML("""
                        <div class="preset-inline">
                          <span class="title">Speaker KV Attention Scaling</span><span class="dim">(</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_enable">enable if generation does not match reference</a>
                          <span class="dim">,</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_off">off</a>
                          <span class="dim">)</span>
                        </div>
                        """)
                        spk_kv_preset_enable = gr.Button("", elem_id="spk_kv_enable", elem_classes=["proxy-btn"])
                        spk_kv_preset_off = gr.Button("", elem_id="spk_kv_off", elem_classes=["proxy-btn"])
                        speaker_k_enable = gr.Checkbox(label="Enable Speaker KV Scaling", value=False, info="Scale speaker attention key-values; useful when the model-generated audio does not at all match the reference audio (i.e. ignores speaker-reference)")
                        
                        with gr.Row(visible=False) as speaker_k_row:
                            speaker_k_scale = gr.Number(label="KV Scale", value=1.5, info="Scale factor", minimum=0, step=0.1)
                            speaker_k_min_t = gr.Number(label="KV Min t", value=0.9, info="(0-1), scale applied from steps t=1. to val", minimum=0, maximum=1, step=0.05)
                            speaker_k_max_layers = gr.Number(label="Max Layers", value=24, info="(0-24), scale applied in first N layers", precision=0, minimum=0, maximum=24)
                    
                    with gr.Group():
                        gr.HTML("""
                        <div class="preset-inline">
                          <span class="title">Truncation &amp; Temporal Rescaling</span><span class="dim">(</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="trunc_flat">flat</a>
                          <span class="dim">,</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="trunc_sharp">sharp</a>
                          <span class="dim">,</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="trunc_baseline">baseline(sharp)</a>
                          <span class="dim">)</span>
                        </div>
                        """)
                        trunc_preset_flat = gr.Button("", elem_id="trunc_flat", elem_classes=["proxy-btn"])
                        trunc_preset_sharp = gr.Button("", elem_id="trunc_sharp", elem_classes=["proxy-btn"])
                        trunc_preset_baseline = gr.Button("", elem_id="trunc_baseline", elem_classes=["proxy-btn"])
                        with gr.Row():
                            truncation_factor = gr.Number(label="Truncation Factor", value=0.8, info="Multiply initial noise (<1 helps artifacts)", minimum=0, step=0.05)
                            rescale_k = gr.Number(label="Rescale k", value=1.2, info="<1=sharpen, >1=flatten, 1=off", minimum=0, step=0.05)
                            rescale_sigma = gr.Number(label="Rescale σ", value=3.0, info="Sigma parameter", minimum=0, step=0.1)
                    
                    with gr.Group(visible=False) as apg_row:
                        gr.HTML("""
                        <div class="preset-inline">
                          <span class="title">APG Parameters</span><span class="dim">(</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="apg_default">default</a>
                          <span class="dim">,</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="apg_no_momentum">no momentum</a>
                          <span class="dim">,</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="apg_norms">norms</a>
                          <span class="dim">,</span>
                          <a href="javascript:void(0)" class="preset-link" data-fire="apg_no_eta">no eta</a>
                          <span class="dim">)</span>
                        </div>
                        """)
                        apg_preset_default = gr.Button("", elem_id="apg_default", elem_classes=["proxy-btn"])
                        apg_preset_no_momentum = gr.Button("", elem_id="apg_no_momentum", elem_classes=["proxy-btn"])
                        apg_preset_norms = gr.Button("", elem_id="apg_norms", elem_classes=["proxy-btn"])
                        apg_preset_no_eta = gr.Button("", elem_id="apg_no_eta", elem_classes=["proxy-btn"])
                        with gr.Row():
                            apg_eta_text = gr.Number(label="APG η (text)", value=0.5, info="Eta for text projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25)
                            apg_eta_speaker = gr.Number(label="APG η (speaker)", value=0.5, info="Eta for speaker projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25)
                        
                        with gr.Row() as apg_row2:
                            apg_momentum_text = gr.Number(label="APG Momentum (text)", value=-0.25, info="Text momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25)
                            apg_momentum_speaker = gr.Number(label="APG Momentum (speaker)", value=-0.25, info="Speaker momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25)
                        with gr.Row():
                            apg_norm_text = gr.Textbox(label="APG Norm (text)", value="", info="Text norm clip (leave blank to disable, can try 7.5, 15.0)")
                            apg_norm_speaker = gr.Textbox(label="APG Norm (speaker)", value="", info="Speaker norm clip (leave blank to disable, can try 7.5, 15.0)")
            # End of advanced_mode_column
    
    # Action row: output format picker, the main generate button, and two
    # opt-in checkboxes. All three option widgets are passed as inputs to the
    # generate button's click handler further down.
    with gr.Row(equal_height=True):
        audio_format = gr.Radio(
            choices=["wav", "mp3"],
            value="wav",
            label="Format",
            scale=1,
            min_width=90
        )
        generate_btn = gr.Button("Generate Audio", variant="primary", size="lg", scale=10)
        with gr.Column(scale=1):
            show_original_audio = gr.Checkbox(
                label="Re-display original audio (full 2-minute cropped mono)",
                value=False
            )
            reconstruct_first_30_seconds = gr.Checkbox(
                label="Show Autoencoder Reconstruction (only first 30s of reference)",
                value=False
            )

    # Results section. Every named component below is listed as an output of
    # the generate button's click handler; the markdown fields, the header and
    # the two nested accordions start hidden (visible=False).
    gr.HTML('<hr class="section-separator">')
    with gr.Accordion("Generated Audio", open=True, visible=True) as generated_section:
        generation_time_display = gr.Markdown("", visible=False)
        with gr.Group(elem_classes=["generated-audio-player"]):
            generated_audio = gr.Audio(label="Generated Audio", visible=True)
        text_prompt_display = gr.Markdown("", visible=False)
        
        gr.Markdown("---")
        reference_audio_header = gr.Markdown("#### Reference Audio", visible=False)
        
        # Optional playback of the reference audio (hidden accordion, collapsed).
        with gr.Accordion("Original Audio (2 min Cropped Mono)", open=False, visible=False) as original_accordion:
            original_audio = gr.Audio(label="Original Reference Audio (2 min)", visible=True)
        
        # Optional playback of the autoencoder reconstruction (hidden accordion, collapsed).
        with gr.Accordion("Autoencoder Reconstruction of First 30s of Reference", open=False, visible=False) as reference_accordion:
            reference_audio = gr.Audio(label="Decoded Reference Audio (30s)", visible=True)

    # Event handlers
    # Custom Audio Panel: mirror the current audio value into
    # speaker_audio_path_state, storing "" whenever the value is empty/falsy.
    custom_audio_input.change(
        lambda audio: gr.update(value=audio or ""),
        inputs=[custom_audio_input],
        outputs=[speaker_audio_path_state]
    )
    
    # Audio prompt library table selection (only if configured).
    # The handler's result is written to custom_audio_input, which in turn
    # fires the custom_audio_input.change handler registered above.
    # NOTE(review): audio_prompt_table is presumably only created under this
    # same folder check earlier in the file - confirm before changing the guard.
    if AUDIO_PROMPT_FOLDER is not None and AUDIO_PROMPT_FOLDER.exists():
        audio_prompt_table.select(
            select_audio_prompt_file,
            outputs=[custom_audio_input]
        )
    
    # Dataset selector: switch between Custom Audio Panel, EARS, VCTK, Expresso, and HF-Custom.
    # switch_dataset receives the selected dataset and must return one update
    # per output listed below, in this order: license text, the per-dataset
    # rows/columns, the speaker/voice display fields and tables, the selected
    # speaker state, the audio preview, and the two speaker path states.
    dataset_selector.change(
        switch_dataset,
        inputs=[dataset_selector],
        outputs=[
            dataset_license_info, custom_audio_row, voicebank_row, voice_type_column,
            ears_column, vctk_column, expresso_column, hf_custom_column,
            selected_speaker_display, freeform_table, emotions_table,
            selected_voice_display, vctk_speaker_display, expresso_speaker_display, hf_custom_speaker_display, selected_speaker_state,
            audio_preview, speaker_st_path_state, speaker_audio_path_state
        ]
    )
    
    # Speaker browsing. Each dataset follows the same two-step pattern:
    #   * <search box>.change  -> refreshes that dataset's speakers table
    #   * <speakers table>.select -> updates the display text, the selected
    #     speaker state, the audio preview and the two speaker path states
    #     (speaker_st_path_state / speaker_audio_path_state) that the generate
    #     handler later reads.

    # EARS: Speaker search
    speaker_search.change(
        search_speakers,
        inputs=[speaker_search],
        outputs=[speakers_table]
    )
    
    # EARS: Speaker selection - populate freeform and emotions tables
    # (EARS additionally updates the freeform/emotions tables and the selected voice display)
    speakers_table.select(
        select_speaker_from_table,
        inputs=[speakers_table],
        outputs=[selected_speaker_display, freeform_table, emotions_table, selected_speaker_state, audio_preview, speaker_st_path_state, speaker_audio_path_state, selected_voice_display]
    )
    
    # VCTK: Speaker search
    vctk_speaker_search.change(
        search_vctk_speakers,
        inputs=[vctk_speaker_search],
        outputs=[vctk_speakers_table]
    )
    
    # VCTK: Speaker selection - load voice files directly
    vctk_speakers_table.select(
        select_vctk_speaker_from_table,
        inputs=[vctk_speakers_table],
        outputs=[vctk_speaker_display, selected_speaker_state, audio_preview, speaker_st_path_state, speaker_audio_path_state]
    )
    
    # Expresso: Speaker search
    expresso_speaker_search.change(
        search_expresso_speakers,
        inputs=[expresso_speaker_search],
        outputs=[expresso_speakers_table]
    )
    
    # Expresso: Speaker selection - load voice files directly
    expresso_speakers_table.select(
        select_expresso_speaker_from_table,
        inputs=[expresso_speakers_table],
        outputs=[expresso_speaker_display, selected_speaker_state, audio_preview, speaker_st_path_state, speaker_audio_path_state]
    )
    
    # HF-Custom: Speaker search
    hf_custom_speaker_search.change(
        search_hf_custom_speakers,
        inputs=[hf_custom_speaker_search],
        outputs=[hf_custom_speakers_table]
    )
    
    # HF-Custom: Speaker selection - load voice files directly
    hf_custom_speakers_table.select(
        select_hf_custom_speaker_from_table,
        inputs=[hf_custom_speakers_table],
        outputs=[hf_custom_speaker_display, selected_speaker_state, audio_preview, speaker_st_path_state, speaker_audio_path_state]
    )
    
    # Freeform selection: load freeform voice files
    # Only the selected speaker state is passed as an input here.
    # NOTE(review): the clicked row presumably reaches the handler through
    # Gradio's select-event data - confirm in select_freeform_from_table.
    freeform_table.select(
        select_freeform_from_table,
        inputs=[selected_speaker_state],
        outputs=[selected_voice_display, audio_preview, speaker_st_path_state, speaker_audio_path_state]
    )
    
    # Emotion selection: load voice files
    # Same inputs/outputs as the freeform handler above.
    emotions_table.select(
        select_emotion_from_table,
        inputs=[selected_speaker_state],
        outputs=[selected_voice_display, audio_preview, speaker_st_path_state, speaker_audio_path_state]
    )
    
    # Text presets: selecting a row writes the handler's result into the text prompt box.
    text_presets_table.select(select_text_preset, outputs=text_prompt)
    
    # Mode selector handler
    # Step 1: toggle_mode receives the selected mode plus both speaker-KV
    # checkboxes and updates the simple/advanced containers and the checkboxes.
    mode_selector.change(
        toggle_mode,
        inputs=[mode_selector, speaker_k_enable, speaker_kv_simple_checkbox],
        outputs=[simple_mode_row, advanced_mode_compile_column, advanced_mode_column, speaker_kv_simple_checkbox, speaker_k_enable]
    ).then(
        # Step 2 (runs after step 1): show the speaker-KV row only when the
        # advanced checkbox is enabled. The scale / min-t / max-layers fields
        # are always written with the same fixed values (1.5, 0.9, 24); the
        # previous `x if enabled else x` ternaries had identical branches, so
        # they never depended on `enabled`.
        lambda enabled: (
            gr.update(visible=enabled),
            gr.update(value=1.5),
            gr.update(value=0.9),
            gr.update(value=24),
        ),
        inputs=[speaker_k_enable],
        outputs=[speaker_k_row, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers]
    )
    
    # Simple mode speaker KV checkbox handler
    # sync_simple_to_advanced receives the simple-mode checkbox value and
    # updates the advanced checkbox, the speaker-KV row and its three fields.
    speaker_kv_simple_checkbox.change(
        sync_simple_to_advanced,
        inputs=[speaker_kv_simple_checkbox],
        outputs=[speaker_k_enable, speaker_k_row, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers]
    )
    
    # CFG mode: update_cfg_visibility receives the selected mode and updates
    # the two CFG scale inputs and the APG row.
    cfg_mode.change(update_cfg_visibility, inputs=cfg_mode, outputs=[cfg_scale_text, cfg_scale_speaker, apg_row])
    
    # Speaker K enable handler - toggle row visibility and sync with simple mode
    # NOTE(review): this handler writes speaker_kv_simple_checkbox while the
    # handler above writes speaker_k_enable, so each can trigger the other's
    # change event; this presumably settles once both hold the same value -
    # confirm no update loop occurs.
    speaker_k_enable.change(
        lambda enabled: (gr.update(visible=enabled), gr.update(value=enabled)),
        inputs=[speaker_k_enable],
        outputs=[speaker_k_row, speaker_kv_simple_checkbox]
    )
    
    # Custom shapes: show the extra row when enabled; hide it and restore the
    # three shape fields when disabled.
    def toggle_custom_shapes(enabled):
        """Build the updates for the custom-shapes checkbox.

        Args:
            enabled: Current value of the custom-shapes checkbox.

        Returns:
            A 4-tuple of updates, in the order the handler is wired below:
            the custom shapes row, then the max text byte length, max speaker
            latent length and sample latent length fields. When enabled, only
            the row becomes visible and the three fields are left untouched;
            when disabled, the row is hidden and the fields are set back to
            "768", "2560" and "640".
        """
        if not enabled:
            return (
                gr.update(visible=False),
                gr.update(value="768"),
                gr.update(value="2560"),
                gr.update(value="640"),
            )
        # Empty updates leave whatever the user typed in place.
        return (gr.update(visible=True), gr.update(), gr.update(), gr.update())

    use_custom_shapes_checkbox.change(
        toggle_custom_shapes,
        inputs=[use_custom_shapes_checkbox],
        outputs=[
            custom_shapes_row,
            max_text_byte_length,
            max_speaker_latent_length,
            sample_latent_len,
        ],
    )
    
    # Preset proxy-button handlers.
    # Every hidden proxy button applies one named preset from its family and
    # writes the result into that family's output components.
    def _wire_preset_buttons(apply_preset, targets, button_presets):
        """Register one click handler per (button, preset name) pair.

        Args:
            apply_preset: Function called with the preset name on click; its
                return value is written to ``targets``.
            targets: Output components shared by every button of the family.
            button_presets: Sequence of ``(button, preset_name)`` pairs; the
                handlers are registered in the order given.
        """
        def _handler_for(preset_name):
            # Built in its own scope so each button keeps its own preset name
            # rather than sharing the loop variable.
            return lambda: apply_preset(preset_name)

        for button, preset_name in button_presets:
            # Hand each registration its own copy of the output list.
            button.click(_handler_for(preset_name), outputs=list(targets))

    # Core preset
    _wire_preset_buttons(
        apply_core_preset,
        [rng_seed, num_steps, cfg_mode, preset_dropdown],
        [(core_preset_default, "default")],
    )

    # CFG presets
    _wire_preset_buttons(
        apply_cfg_preset,
        [cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t, preset_dropdown],
        [
            (cfg_preset_default, "default"),
            (cfg_preset_higher_speaker, "higher speaker"),
            (cfg_preset_large_guidances, "large guidances"),
        ],
    )

    # Speaker KV presets
    _wire_preset_buttons(
        apply_speaker_kv_preset,
        [speaker_k_enable, speaker_k_row, preset_dropdown],
        [
            (spk_kv_preset_enable, "enable"),
            (spk_kv_preset_off, "off"),
        ],
    )

    # Truncation presets
    _wire_preset_buttons(
        apply_truncation_preset,
        [truncation_factor, rescale_k, rescale_sigma, preset_dropdown],
        [
            (trunc_preset_flat, "flat"),
            (trunc_preset_sharp, "sharp"),
            (trunc_preset_baseline, "baseline(sharp)"),
        ],
    )

    # APG presets
    _wire_preset_buttons(
        apply_apg_preset,
        [
            apg_eta_text, apg_eta_speaker,
            apg_momentum_text, apg_momentum_speaker,
            apg_norm_text, apg_norm_speaker,
            preset_dropdown,
        ],
        [
            (apg_preset_default, "default"),
            (apg_preset_no_momentum, "no momentum"),
            (apg_preset_norms, "norms"),
            (apg_preset_no_eta, "no eta"),
        ],
    )
    
    # Preset handler
    # apply_sampler_preset receives the selected preset name and must return
    # one value per output below, in this exact order. The same output list is
    # reused by the demo.load chain at the end of the file, so the two must be
    # kept in sync.
    preset_dropdown.change(
        apply_sampler_preset,
        inputs=preset_dropdown,
        outputs=[num_steps, cfg_mode, cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t, truncation_factor, rescale_k, rescale_sigma, speaker_k_enable, speaker_k_row, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers, apg_eta_text, apg_eta_speaker, apg_momentum_text, apg_momentum_speaker, apg_norm_text, apg_norm_speaker]
    )
    
    # Compile handler
    # Two chained steps: compile_model receives the checkbox value and updates
    # the checkbox and the status text; do_compile then runs with no inputs and
    # updates the status text and the checkbox again.
    # NOTE(review): presumably step 1 only gives immediate UI feedback and
    # step 2 performs the actual compilation - confirm in those functions.
    compile_checkbox.change(
        compile_model,
        inputs=compile_checkbox,
        outputs=[compile_checkbox, compile_status]
    ).then(
        do_compile,
        outputs=[compile_status, compile_checkbox]
    )
    
    # Main generation handler.
    # Gradio passes the input values to generate_audio positionally, in the
    # order listed, and maps its return values onto `outputs` in order, so both
    # lists must stay aligned with generate_audio's parameters and return tuple
    # (its definition is elsewhere in this file).
    generate_btn.click(
        generate_audio,
        inputs=[
            # Prompt text and speaker reference (paths held in state)
            text_prompt,
            speaker_st_path_state,
            speaker_audio_path_state,
            # Core sampler settings
            num_steps,
            rng_seed,
            cfg_mode,
            cfg_scale_text,
            cfg_scale_speaker,
            cfg_min_t,
            cfg_max_t,
            truncation_factor,
            rescale_k,
            rescale_sigma,
            # Speaker KV settings
            speaker_k_enable,
            speaker_k_scale,
            speaker_k_min_t,
            speaker_k_max_layers,
            # APG settings
            apg_eta_text,
            apg_eta_speaker,
            apg_momentum_text,
            apg_momentum_speaker,
            apg_norm_text,
            apg_norm_speaker,
            # Output / display options and custom shapes
            reconstruct_first_30_seconds,
            use_custom_shapes_checkbox,
            max_text_byte_length,
            max_speaker_latent_length,
            sample_latent_len,
            audio_format,
            compile_checkbox,  # Pass compile state to choose model
            show_original_audio,
            session_id_state,
        ],
        outputs=[generated_section, generated_audio, text_prompt_display, original_audio, generation_time_display, reference_audio, original_accordion, reference_accordion, reference_audio_header]
    )
    
    # Initialize session ID and trigger compilation when the page loads.
    # init_and_compile fills the session id state, the compile status text and
    # the compile checkbox; the chained step then applies a sampler preset.
    demo.load(init_and_compile, outputs=[session_id_state, compile_status, compile_checkbox]).then(
        # Apply the first preset on load: the first key returned by
        # load_sampler_presets(). Indexing [0] raises IndexError if no presets
        # are available. The outputs mirror the preset_dropdown.change handler.
        lambda: apply_sampler_preset(list(load_sampler_presets().keys())[0]),
        outputs=[num_steps, cfg_mode, cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t, truncation_factor, rescale_k, rescale_sigma, speaker_k_enable, speaker_k_row, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers, apg_eta_text, apg_eta_speaker, apg_momentum_text, apg_momentum_speaker, apg_norm_text, apg_norm_speaker]
    )


if __name__ == "__main__":
    # Script entry point: collect the directories Gradio is allowed to serve
    # files from, enable the request queue, and launch the app.

    # For HF-Custom, allow the entire dataset cache directory to handle subdirectories
    hf_custom_cache = HF_CUSTOM_PATH.parent.parent.parent
    candidate_dirs = [
        EARS_PATH,
        VCTK_PATH,
        EXPRESSO_PATH,
        hf_custom_cache,
        TEMP_AUDIO_DIR,
        AUDIO_PROMPT_FOLDER,
    ]
    # AUDIO_PROMPT_FOLDER is optional (the UI wiring checks it against None),
    # so drop unset entries instead of whitelisting the literal path "None".
    allowed_paths = [str(directory) for directory in candidate_dirs if directory is not None]

    # Enable queue for better handling of concurrent requests on HF Spaces
    demo.queue(max_size=20)
    demo.launch(allowed_paths=allowed_paths)