Spaces:
Running
on
Zero
Running
on
Zero
刘鑫
committed on
Commit
·
3c79c15
1
Parent(s):
d75344d
set zero gpu inference
Browse files
app.py
CHANGED
|
@@ -42,7 +42,6 @@ if os.environ.get("HF_REPO_ID", "").strip() == "":
|
|
| 42 |
_asr_model = None
|
| 43 |
_voxcpm_model = None
|
| 44 |
_default_local_model_dir = "./models/VoxCPM1.5"
|
| 45 |
-
_zipenhancer_local_path = None # Will be set after pre-download
|
| 46 |
|
| 47 |
|
| 48 |
def predownload_models():
|
|
@@ -50,36 +49,19 @@ def predownload_models():
|
|
| 50 |
Pre-download models at startup (runs in main process, not GPU worker).
|
| 51 |
This ensures models are cached before GPU functions are called.
|
| 52 |
"""
|
| 53 |
-
global _zipenhancer_local_path
|
| 54 |
-
|
| 55 |
print("=" * 50)
|
| 56 |
print("Pre-downloading models to cache...")
|
| 57 |
-
print(f"MODELSCOPE_CACHE={os.environ.get('MODELSCOPE_CACHE')}")
|
| 58 |
print(f"HF_HOME={os.environ.get('HF_HOME')}")
|
| 59 |
print("=" * 50)
|
| 60 |
|
| 61 |
-
# Pre-download
|
| 62 |
-
try:
|
| 63 |
-
from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot_download
|
| 64 |
-
zipenhancer_model_id = "iic/speech_zipenhancer_ans_multiloss_16k_base"
|
| 65 |
-
print(f"Pre-downloading ZipEnhancer: {zipenhancer_model_id}")
|
| 66 |
-
_zipenhancer_local_path = ms_snapshot_download(
|
| 67 |
-
zipenhancer_model_id,
|
| 68 |
-
cache_dir=os.environ.get("MODELSCOPE_CACHE"),
|
| 69 |
-
)
|
| 70 |
-
print(f"ZipEnhancer downloaded to: {_zipenhancer_local_path}")
|
| 71 |
-
except Exception as e:
|
| 72 |
-
print(f"Warning: Failed to pre-download ZipEnhancer: {e}")
|
| 73 |
-
_zipenhancer_local_path = None
|
| 74 |
-
|
| 75 |
-
# Pre-download ASR model (SenseVoice) from ModelScope
|
| 76 |
try:
|
| 77 |
-
from
|
| 78 |
-
asr_model_id = "
|
| 79 |
print(f"Pre-downloading ASR model: {asr_model_id}")
|
| 80 |
-
asr_local_path =
|
| 81 |
asr_model_id,
|
| 82 |
-
cache_dir=os.environ.get("
|
| 83 |
)
|
| 84 |
print(f"ASR model downloaded to: {asr_local_path}")
|
| 85 |
except Exception as e:
|
|
@@ -121,18 +103,17 @@ def _resolve_model_dir() -> str:
|
|
| 121 |
|
| 122 |
|
| 123 |
def get_asr_model():
|
| 124 |
-
"""Lazy load ASR model."""
|
| 125 |
global _asr_model
|
| 126 |
if _asr_model is None:
|
| 127 |
-
# Setup cache env in GPU worker context
|
| 128 |
setup_cache_env()
|
| 129 |
|
| 130 |
from funasr import AutoModel
|
| 131 |
print("Loading ASR model...")
|
| 132 |
-
print(f"
|
| 133 |
_asr_model = AutoModel(
|
| 134 |
-
model="
|
| 135 |
-
hub="
|
| 136 |
disable_update=True,
|
| 137 |
log_level='INFO',
|
| 138 |
device="cuda:0",
|
|
@@ -141,31 +122,10 @@ def get_asr_model():
|
|
| 141 |
return _asr_model
|
| 142 |
|
| 143 |
|
| 144 |
-
def _get_zipenhancer_local_path():
|
| 145 |
-
"""
|
| 146 |
-
Get ZipEnhancer local path from ModelScope cache.
|
| 147 |
-
This works in both main process and GPU worker.
|
| 148 |
-
"""
|
| 149 |
-
setup_cache_env()
|
| 150 |
-
try:
|
| 151 |
-
from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot_download
|
| 152 |
-
zipenhancer_model_id = "iic/speech_zipenhancer_ans_multiloss_16k_base"
|
| 153 |
-
# This will use cache if already downloaded
|
| 154 |
-
local_path = ms_snapshot_download(
|
| 155 |
-
zipenhancer_model_id,
|
| 156 |
-
cache_dir=os.environ.get("MODELSCOPE_CACHE"),
|
| 157 |
-
)
|
| 158 |
-
return local_path
|
| 159 |
-
except Exception as e:
|
| 160 |
-
print(f"Warning: Failed to get ZipEnhancer path: {e}")
|
| 161 |
-
return "iic/speech_zipenhancer_ans_multiloss_16k_base"
|
| 162 |
-
|
| 163 |
-
|
| 164 |
def get_voxcpm_model():
|
| 165 |
-
"""Lazy load VoxCPM model."""
|
| 166 |
global _voxcpm_model
|
| 167 |
if _voxcpm_model is None:
|
| 168 |
-
# Setup cache env in GPU worker context
|
| 169 |
setup_cache_env()
|
| 170 |
|
| 171 |
import voxcpm
|
|
@@ -173,15 +133,10 @@ def get_voxcpm_model():
|
|
| 173 |
model_dir = _resolve_model_dir()
|
| 174 |
print(f"Using model dir: {model_dir}")
|
| 175 |
|
| 176 |
-
# Get ZipEnhancer local path (uses cache if pre-downloaded)
|
| 177 |
-
zipenhancer_path = _get_zipenhancer_local_path()
|
| 178 |
-
print(f"ZipEnhancer path: {zipenhancer_path}")
|
| 179 |
-
|
| 180 |
_voxcpm_model = voxcpm.VoxCPM(
|
| 181 |
voxcpm_model_path=model_dir,
|
| 182 |
optimize=True,
|
| 183 |
-
enable_denoiser=
|
| 184 |
-
zipenhancer_model_path=zipenhancer_path,
|
| 185 |
)
|
| 186 |
print("VoxCPM model loaded.")
|
| 187 |
return _voxcpm_model
|
|
@@ -206,7 +161,6 @@ def generate_tts_audio_gpu(
|
|
| 206 |
cfg_value_input: float = 2.0,
|
| 207 |
inference_timesteps_input: int = 10,
|
| 208 |
do_normalize: bool = True,
|
| 209 |
-
denoise: bool = True,
|
| 210 |
) -> Tuple[int, np.ndarray]:
|
| 211 |
"""
|
| 212 |
GPU function: Generate speech from text using VoxCPM.
|
|
@@ -237,7 +191,7 @@ def generate_tts_audio_gpu(
|
|
| 237 |
cfg_value=float(cfg_value_input),
|
| 238 |
inference_timesteps=int(inference_timesteps_input),
|
| 239 |
normalize=do_normalize,
|
| 240 |
-
denoise=
|
| 241 |
)
|
| 242 |
return (voxcpm_model.tts_model.sample_rate, wav)
|
| 243 |
finally:
|
|
@@ -256,7 +210,6 @@ def generate_tts_audio(
|
|
| 256 |
cfg_value_input: float = 2.0,
|
| 257 |
inference_timesteps_input: int = 10,
|
| 258 |
do_normalize: bool = True,
|
| 259 |
-
denoise: bool = True,
|
| 260 |
) -> Tuple[int, np.ndarray]:
|
| 261 |
"""
|
| 262 |
Wrapper: Read audio file in CPU, then call GPU function.
|
|
@@ -280,7 +233,6 @@ def generate_tts_audio(
|
|
| 280 |
cfg_value_input=cfg_value_input,
|
| 281 |
inference_timesteps_input=inference_timesteps_input,
|
| 282 |
do_normalize=do_normalize,
|
| 283 |
-
denoise=denoise,
|
| 284 |
)
|
| 285 |
|
| 286 |
|
|
@@ -347,12 +299,6 @@ def create_demo_interface():
|
|
| 347 |
# Pro Tips
|
| 348 |
with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
|
| 349 |
gr.Markdown("""
|
| 350 |
-
### Prompt Speech Enhancement|参考语音降噪
|
| 351 |
-
- **Enable** to remove background noise for a clean voice, with an external ZipEnhancer component. However, this will limit the audio sampling rate to 16kHz, restricting the cloning quality ceiling.
|
| 352 |
-
**启用**:通过 ZipEnhancer 组件消除背景噪音,但会将音频采样率限制在16kHz,限制克隆上限。
|
| 353 |
-
- **Disable** to preserve the original audio's all information, including background atmosphere, and support audio cloning up to 44.1kHz sampling rate.
|
| 354 |
-
**禁用**:保留原始音频的全部信息,包括背景环境声,最高支持44.1kHz的音频复刻。
|
| 355 |
-
|
| 356 |
### Text Normalization|文本正则化
|
| 357 |
- **Enable** to process general text with an external WeTextProcessing component.
|
| 358 |
**启用**:使用 WeTextProcessing 组件,可支持常见文本的正则化处理。
|
|
@@ -381,12 +327,6 @@ def create_demo_interface():
|
|
| 381 |
label="Prompt Speech (Optional, or let VoxCPM improvise)",
|
| 382 |
value="./examples/example.wav",
|
| 383 |
)
|
| 384 |
-
DoDenoisePromptAudio = gr.Checkbox(
|
| 385 |
-
value=False,
|
| 386 |
-
label="Prompt Speech Enhancement",
|
| 387 |
-
elem_id="chk_denoise",
|
| 388 |
-
info="We use ZipEnhancer model to denoise the prompt audio."
|
| 389 |
-
)
|
| 390 |
with gr.Row():
|
| 391 |
prompt_text = gr.Textbox(
|
| 392 |
value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.",
|
|
@@ -429,7 +369,7 @@ def create_demo_interface():
|
|
| 429 |
# Wiring
|
| 430 |
run_btn.click(
|
| 431 |
fn=generate_tts_audio,
|
| 432 |
-
inputs=[text, prompt_wav, prompt_text, cfg_value, inference_timesteps, DoNormalizeText
|
| 433 |
outputs=[audio_output],
|
| 434 |
show_progress=True,
|
| 435 |
api_name="generate",
|
|
|
|
| 42 |
_asr_model = None
|
| 43 |
_voxcpm_model = None
|
| 44 |
_default_local_model_dir = "./models/VoxCPM1.5"
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
def predownload_models():
|
|
|
|
| 49 |
Pre-download models at startup (runs in main process, not GPU worker).
|
| 50 |
This ensures models are cached before GPU functions are called.
|
| 51 |
"""
|
|
|
|
|
|
|
| 52 |
print("=" * 50)
|
| 53 |
print("Pre-downloading models to cache...")
|
|
|
|
| 54 |
print(f"HF_HOME={os.environ.get('HF_HOME')}")
|
| 55 |
print("=" * 50)
|
| 56 |
|
| 57 |
+
# Pre-download ASR model (SenseVoice) from HuggingFace
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
try:
|
| 59 |
+
from huggingface_hub import snapshot_download
|
| 60 |
+
asr_model_id = "FunAudioLLM/SenseVoiceSmall"
|
| 61 |
print(f"Pre-downloading ASR model: {asr_model_id}")
|
| 62 |
+
asr_local_path = snapshot_download(
|
| 63 |
asr_model_id,
|
| 64 |
+
cache_dir=os.environ.get("HF_HOME"),
|
| 65 |
)
|
| 66 |
print(f"ASR model downloaded to: {asr_local_path}")
|
| 67 |
except Exception as e:
|
|
|
|
| 103 |
|
| 104 |
|
| 105 |
def get_asr_model():
|
| 106 |
+
"""Lazy load ASR model from HuggingFace."""
|
| 107 |
global _asr_model
|
| 108 |
if _asr_model is None:
|
|
|
|
| 109 |
setup_cache_env()
|
| 110 |
|
| 111 |
from funasr import AutoModel
|
| 112 |
print("Loading ASR model...")
|
| 113 |
+
print(f" HF_HOME={os.environ.get('HF_HOME')}")
|
| 114 |
_asr_model = AutoModel(
|
| 115 |
+
model="FunAudioLLM/SenseVoiceSmall", # HuggingFace model ID
|
| 116 |
+
hub="hf", # Use HuggingFace Hub
|
| 117 |
disable_update=True,
|
| 118 |
log_level='INFO',
|
| 119 |
device="cuda:0",
|
|
|
|
| 122 |
return _asr_model
|
| 123 |
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
def get_voxcpm_model():
|
| 126 |
+
"""Lazy load VoxCPM model (without denoiser)."""
|
| 127 |
global _voxcpm_model
|
| 128 |
if _voxcpm_model is None:
|
|
|
|
| 129 |
setup_cache_env()
|
| 130 |
|
| 131 |
import voxcpm
|
|
|
|
| 133 |
model_dir = _resolve_model_dir()
|
| 134 |
print(f"Using model dir: {model_dir}")
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
_voxcpm_model = voxcpm.VoxCPM(
|
| 137 |
voxcpm_model_path=model_dir,
|
| 138 |
optimize=True,
|
| 139 |
+
enable_denoiser=False, # Disable denoiser to avoid ZipEnhancer download
|
|
|
|
| 140 |
)
|
| 141 |
print("VoxCPM model loaded.")
|
| 142 |
return _voxcpm_model
|
|
|
|
| 161 |
cfg_value_input: float = 2.0,
|
| 162 |
inference_timesteps_input: int = 10,
|
| 163 |
do_normalize: bool = True,
|
|
|
|
| 164 |
) -> Tuple[int, np.ndarray]:
|
| 165 |
"""
|
| 166 |
GPU function: Generate speech from text using VoxCPM.
|
|
|
|
| 191 |
cfg_value=float(cfg_value_input),
|
| 192 |
inference_timesteps=int(inference_timesteps_input),
|
| 193 |
normalize=do_normalize,
|
| 194 |
+
denoise=False, # Denoiser disabled
|
| 195 |
)
|
| 196 |
return (voxcpm_model.tts_model.sample_rate, wav)
|
| 197 |
finally:
|
|
|
|
| 210 |
cfg_value_input: float = 2.0,
|
| 211 |
inference_timesteps_input: int = 10,
|
| 212 |
do_normalize: bool = True,
|
|
|
|
| 213 |
) -> Tuple[int, np.ndarray]:
|
| 214 |
"""
|
| 215 |
Wrapper: Read audio file in CPU, then call GPU function.
|
|
|
|
| 233 |
cfg_value_input=cfg_value_input,
|
| 234 |
inference_timesteps_input=inference_timesteps_input,
|
| 235 |
do_normalize=do_normalize,
|
|
|
|
| 236 |
)
|
| 237 |
|
| 238 |
|
|
|
|
| 299 |
# Pro Tips
|
| 300 |
with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
|
| 301 |
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
### Text Normalization|文本正则化
|
| 303 |
- **Enable** to process general text with an external WeTextProcessing component.
|
| 304 |
**启用**:使用 WeTextProcessing 组件,可支持常见文本的正则化处理。
|
|
|
|
| 327 |
label="Prompt Speech (Optional, or let VoxCPM improvise)",
|
| 328 |
value="./examples/example.wav",
|
| 329 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
with gr.Row():
|
| 331 |
prompt_text = gr.Textbox(
|
| 332 |
value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.",
|
|
|
|
| 369 |
# Wiring
|
| 370 |
run_btn.click(
|
| 371 |
fn=generate_tts_audio,
|
| 372 |
+
inputs=[text, prompt_wav, prompt_text, cfg_value, inference_timesteps, DoNormalizeText],
|
| 373 |
outputs=[audio_output],
|
| 374 |
show_progress=True,
|
| 375 |
api_name="generate",
|