import os
import numpy as np
import torch
import gradio as gr
import spaces
from typing import Optional, Tuple
from pathlib import Path
import tempfile
import soundfile as sf

os.environ["TOKENIZERS_PARALLELISM"] = "false"
if os.environ.get("HF_REPO_ID", "").strip() == "":
    os.environ["HF_REPO_ID"] = "openbmb/VoxCPM1.5"

# Global model cache for ZeroGPU
_asr_model = None
_voxcpm_model = None
_default_local_model_dir = "./models/VoxCPM1.5"


def _resolve_model_dir() -> str:
    """
    Resolve the model directory:
    1) Use the local checkpoint directory if it exists
    2) If the HF_REPO_ID env var is set, download into models/{repo}
    3) Fall back to 'models'
    """
    if os.path.isdir(_default_local_model_dir):
        return _default_local_model_dir
    repo_id = os.environ.get("HF_REPO_ID", "").strip()
    if len(repo_id) > 0:
        target_dir = os.path.join("models", repo_id.replace("/", "__"))
        if not os.path.isdir(target_dir):
            try:
                from huggingface_hub import snapshot_download
                os.makedirs(target_dir, exist_ok=True)
                print(f"Downloading model from HF repo '{repo_id}' to '{target_dir}' ...")
                snapshot_download(repo_id=repo_id, local_dir=target_dir, local_dir_use_symlinks=False)
            except Exception as e:
                print(f"Warning: HF download failed: {e}. Falling back to 'models'.")
                return "models"
        return target_dir
    return "models"


def get_asr_model():
    """Lazily load the ASR model."""
    global _asr_model
    if _asr_model is None:
        from funasr import AutoModel
        print("Loading ASR model...")
        # Set the ModelScope cache directory for persistence
        cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "modelscope")
        os.makedirs(cache_dir, exist_ok=True)
        os.environ["MODELSCOPE_CACHE"] = cache_dir
        _asr_model = AutoModel(
            model="iic/SenseVoiceSmall",  # ModelScope model ID
            hub="ms",  # use the ModelScope Hub
            disable_update=True,
            log_level="INFO",
            device="cuda:0",
        )
        print("ASR model loaded.")
    return _asr_model


def get_voxcpm_model():
    """Lazily load the VoxCPM model."""
    global _voxcpm_model
    if _voxcpm_model is None:
        import voxcpm
        print("Loading VoxCPM model...")
        model_dir = _resolve_model_dir()
        print(f"Using model dir: {model_dir}")
        _voxcpm_model = voxcpm.VoxCPM(voxcpm_model_path=model_dir, optimize=False)
        print("VoxCPM model loaded.")
    return _voxcpm_model


@spaces.GPU(duration=120)
def prompt_wav_recognition(prompt_wav: Optional[str]) -> str:
    """Use ASR to transcribe the prompt audio."""
    if prompt_wav is None or not prompt_wav.strip():
        return ""
    asr_model = get_asr_model()
    res = asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
    # Drop the SenseVoice special tags (language/emotion/event tokens) that precede the transcript
    text = res[0]["text"].split("|>")[-1]
    return text


@spaces.GPU(duration=120)
def generate_tts_audio_gpu(
    text_input: str,
    prompt_wav_data: Optional[Tuple[np.ndarray, int]] = None,
    prompt_text_input: Optional[str] = None,
    cfg_value_input: float = 2.0,
    inference_timesteps_input: int = 10,
    do_normalize: bool = True,
    denoise: bool = True,
) -> Tuple[int, np.ndarray]:
    """
    GPU function: generate speech from text using VoxCPM.
    prompt_wav_data is an (audio_array, sample_rate) tuple.
    """
    voxcpm_model = get_voxcpm_model()
    text = (text_input or "").strip()
    if len(text) == 0:
        raise ValueError("Please input text to synthesize.")
    prompt_text = prompt_text_input if prompt_text_input else None
    prompt_wav_path = None
    # If prompt audio data is provided, write it to a temp file for voxcpm
    if prompt_wav_data is not None:
        audio_array, sr = prompt_wav_data
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, audio_array, sr)
            prompt_wav_path = f.name
    try:
        print(f"Generating audio for text: '{text[:60]}...'")
        wav = voxcpm_model.generate(
            text=text,
            prompt_text=prompt_text,
            prompt_wav_path=prompt_wav_path,
            cfg_value=float(cfg_value_input),
            inference_timesteps=int(inference_timesteps_input),
            normalize=do_normalize,
            denoise=denoise,
        )
        return (voxcpm_model.tts_model.sample_rate, wav)
    finally:
        # Clean up the temp file
        if prompt_wav_path and os.path.exists(prompt_wav_path):
            try:
                os.unlink(prompt_wav_path)
            except Exception:
                pass


def generate_tts_audio(
    text_input: str,
    prompt_wav_path_input: Optional[str] = None,
    prompt_text_input: Optional[str] = None,
    cfg_value_input: float = 2.0,
    inference_timesteps_input: int = 10,
    do_normalize: bool = True,
    denoise: bool = True,
) -> Tuple[int, np.ndarray]:
    """
    Wrapper: read the audio file on the CPU, then call the GPU function.
    """
    prompt_wav_data = None
    # Read the audio file before entering the GPU context
    if prompt_wav_path_input and os.path.exists(prompt_wav_path_input):
        try:
            audio_array, sr = sf.read(prompt_wav_path_input, dtype="float32")
            prompt_wav_data = (audio_array, sr)
            print(f"Loaded prompt audio: {audio_array.shape}, sr={sr}")
        except Exception as e:
            print(f"Warning: Failed to load prompt audio: {e}")
            prompt_wav_data = None
    return generate_tts_audio_gpu(
        text_input=text_input,
        prompt_wav_data=prompt_wav_data,
        prompt_text_input=prompt_text_input,
        cfg_value_input=cfg_value_input,
        inference_timesteps_input=inference_timesteps_input,
        do_normalize=do_normalize,
        denoise=denoise,
    )


# ---------- UI Builders ----------

def create_demo_interface():
    """Build the Gradio UI for the VoxCPM demo."""
    # Static assets (logo path)
    try:
        gr.set_static_paths(paths=[Path.cwd().absolute() / "assets"])
    except Exception:
        pass

    with gr.Blocks(
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="gray",
            neutral_hue="slate",
            font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
        ),
        css="""
        .logo-container { text-align: center; margin: 0.5rem 0 1rem 0; }
        .logo-container img { height: 80px; width: auto; max-width: 200px; display: inline-block; }
        /* Bold accordion labels */
        #acc_quick details > summary, #acc_tips details > summary { font-weight: 600 !important; font-size: 1.1em !important; }
        /* Bold labels for specific checkboxes */
        #chk_denoise label, #chk_denoise span, #chk_normalize label, #chk_normalize span { font-weight: 600; }
        """,
    ) as interface:
        # Header logo
        gr.HTML('
