刘鑫 committed on
Commit
3c79c15
·
1 Parent(s): d75344d

set zero gpu inference

Browse files
Files changed (1) hide show
  1. app.py +13 -73
app.py CHANGED
@@ -42,7 +42,6 @@ if os.environ.get("HF_REPO_ID", "").strip() == "":
42
  _asr_model = None
43
  _voxcpm_model = None
44
  _default_local_model_dir = "./models/VoxCPM1.5"
45
- _zipenhancer_local_path = None # Will be set after pre-download
46
 
47
 
48
  def predownload_models():
@@ -50,36 +49,19 @@ def predownload_models():
50
  Pre-download models at startup (runs in main process, not GPU worker).
51
  This ensures models are cached before GPU functions are called.
52
  """
53
- global _zipenhancer_local_path
54
-
55
  print("=" * 50)
56
  print("Pre-downloading models to cache...")
57
- print(f"MODELSCOPE_CACHE={os.environ.get('MODELSCOPE_CACHE')}")
58
  print(f"HF_HOME={os.environ.get('HF_HOME')}")
59
  print("=" * 50)
60
 
61
- # Pre-download ZipEnhancer from ModelScope
62
- try:
63
- from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot_download
64
- zipenhancer_model_id = "iic/speech_zipenhancer_ans_multiloss_16k_base"
65
- print(f"Pre-downloading ZipEnhancer: {zipenhancer_model_id}")
66
- _zipenhancer_local_path = ms_snapshot_download(
67
- zipenhancer_model_id,
68
- cache_dir=os.environ.get("MODELSCOPE_CACHE"),
69
- )
70
- print(f"ZipEnhancer downloaded to: {_zipenhancer_local_path}")
71
- except Exception as e:
72
- print(f"Warning: Failed to pre-download ZipEnhancer: {e}")
73
- _zipenhancer_local_path = None
74
-
75
- # Pre-download ASR model (SenseVoice) from ModelScope
76
  try:
77
- from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot_download
78
- asr_model_id = "iic/SenseVoiceSmall"
79
  print(f"Pre-downloading ASR model: {asr_model_id}")
80
- asr_local_path = ms_snapshot_download(
81
  asr_model_id,
82
- cache_dir=os.environ.get("MODELSCOPE_CACHE"),
83
  )
84
  print(f"ASR model downloaded to: {asr_local_path}")
85
  except Exception as e:
@@ -121,18 +103,17 @@ def _resolve_model_dir() -> str:
121
 
122
 
123
  def get_asr_model():
124
- """Lazy load ASR model."""
125
  global _asr_model
126
  if _asr_model is None:
127
- # Setup cache env in GPU worker context
128
  setup_cache_env()
129
 
130
  from funasr import AutoModel
131
  print("Loading ASR model...")
132
- print(f" MODELSCOPE_CACHE={os.environ.get('MODELSCOPE_CACHE')}")
133
  _asr_model = AutoModel(
134
- model="iic/SenseVoiceSmall", # ModelScope model ID
135
- hub="ms", # Use ModelScope Hub
136
  disable_update=True,
137
  log_level='INFO',
138
  device="cuda:0",
@@ -141,31 +122,10 @@ def get_asr_model():
141
  return _asr_model
142
 
143
 
144
- def _get_zipenhancer_local_path():
145
- """
146
- Get ZipEnhancer local path from ModelScope cache.
147
- This works in both main process and GPU worker.
148
- """
149
- setup_cache_env()
150
- try:
151
- from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot_download
152
- zipenhancer_model_id = "iic/speech_zipenhancer_ans_multiloss_16k_base"
153
- # This will use cache if already downloaded
154
- local_path = ms_snapshot_download(
155
- zipenhancer_model_id,
156
- cache_dir=os.environ.get("MODELSCOPE_CACHE"),
157
- )
158
- return local_path
159
- except Exception as e:
160
- print(f"Warning: Failed to get ZipEnhancer path: {e}")
161
- return "iic/speech_zipenhancer_ans_multiloss_16k_base"
162
-
163
-
164
  def get_voxcpm_model():
165
- """Lazy load VoxCPM model."""
166
  global _voxcpm_model
167
  if _voxcpm_model is None:
168
- # Setup cache env in GPU worker context
169
  setup_cache_env()
170
 
171
  import voxcpm
@@ -173,15 +133,10 @@ def get_voxcpm_model():
173
  model_dir = _resolve_model_dir()
174
  print(f"Using model dir: {model_dir}")
175
 
176
- # Get ZipEnhancer local path (uses cache if pre-downloaded)
177
- zipenhancer_path = _get_zipenhancer_local_path()
178
- print(f"ZipEnhancer path: {zipenhancer_path}")
179
-
180
  _voxcpm_model = voxcpm.VoxCPM(
181
  voxcpm_model_path=model_dir,
182
  optimize=True,
183
- enable_denoiser=True,
184
- zipenhancer_model_path=zipenhancer_path,
185
  )
186
  print("VoxCPM model loaded.")
187
  return _voxcpm_model
@@ -206,7 +161,6 @@ def generate_tts_audio_gpu(
206
  cfg_value_input: float = 2.0,
207
  inference_timesteps_input: int = 10,
208
  do_normalize: bool = True,
209
- denoise: bool = True,
210
  ) -> Tuple[int, np.ndarray]:
211
  """
212
  GPU function: Generate speech from text using VoxCPM.
@@ -237,7 +191,7 @@ def generate_tts_audio_gpu(
237
  cfg_value=float(cfg_value_input),
238
  inference_timesteps=int(inference_timesteps_input),
239
  normalize=do_normalize,
240
- denoise=denoise,
241
  )
242
  return (voxcpm_model.tts_model.sample_rate, wav)
243
  finally:
@@ -256,7 +210,6 @@ def generate_tts_audio(
256
  cfg_value_input: float = 2.0,
257
  inference_timesteps_input: int = 10,
258
  do_normalize: bool = True,
259
- denoise: bool = True,
260
  ) -> Tuple[int, np.ndarray]:
261
  """
262
  Wrapper: Read audio file in CPU, then call GPU function.
@@ -280,7 +233,6 @@ def generate_tts_audio(
280
  cfg_value_input=cfg_value_input,
281
  inference_timesteps_input=inference_timesteps_input,
282
  do_normalize=do_normalize,
283
- denoise=denoise,
284
  )
285
 
286
 
@@ -347,12 +299,6 @@ def create_demo_interface():
347
  # Pro Tips
348
  with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
349
  gr.Markdown("""
350
- ### Prompt Speech Enhancement|参考语音降噪
351
- - **Enable** to remove background noise for a clean voice, with an external ZipEnhancer component. However, this will limit the audio sampling rate to 16kHz, restricting the cloning quality ceiling.
352
- **启用**:通过 ZipEnhancer 组件消除背景噪音,但会将音频采样率限制在16kHz,限制克隆上限。
353
- - **Disable** to preserve the original audio's all information, including background atmosphere, and support audio cloning up to 44.1kHz sampling rate.
354
- **禁用**:保留原始音频的全部信息,包括背景环境声,最高支持44.1kHz的音频复刻。
355
-
356
  ### Text Normalization|文本正则化
357
  - **Enable** to process general text with an external WeTextProcessing component.
358
  **启用**:使用 WeTextProcessing 组件,可支持常见文本的正则化处理。
@@ -381,12 +327,6 @@ def create_demo_interface():
381
  label="Prompt Speech (Optional, or let VoxCPM improvise)",
382
  value="./examples/example.wav",
383
  )
384
- DoDenoisePromptAudio = gr.Checkbox(
385
- value=False,
386
- label="Prompt Speech Enhancement",
387
- elem_id="chk_denoise",
388
- info="We use ZipEnhancer model to denoise the prompt audio."
389
- )
390
  with gr.Row():
391
  prompt_text = gr.Textbox(
392
  value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.",
@@ -429,7 +369,7 @@ def create_demo_interface():
429
  # Wiring
430
  run_btn.click(
431
  fn=generate_tts_audio,
432
- inputs=[text, prompt_wav, prompt_text, cfg_value, inference_timesteps, DoNormalizeText, DoDenoisePromptAudio],
433
  outputs=[audio_output],
434
  show_progress=True,
435
  api_name="generate",
 
42
  _asr_model = None
43
  _voxcpm_model = None
44
  _default_local_model_dir = "./models/VoxCPM1.5"
 
45
 
46
 
47
  def predownload_models():
 
49
  Pre-download models at startup (runs in main process, not GPU worker).
50
  This ensures models are cached before GPU functions are called.
51
  """
 
 
52
  print("=" * 50)
53
  print("Pre-downloading models to cache...")
 
54
  print(f"HF_HOME={os.environ.get('HF_HOME')}")
55
  print("=" * 50)
56
 
57
+ # Pre-download ASR model (SenseVoice) from HuggingFace
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  try:
59
+ from huggingface_hub import snapshot_download
60
+ asr_model_id = "FunAudioLLM/SenseVoiceSmall"
61
  print(f"Pre-downloading ASR model: {asr_model_id}")
62
+ asr_local_path = snapshot_download(
63
  asr_model_id,
64
+ cache_dir=os.environ.get("HF_HOME"),
65
  )
66
  print(f"ASR model downloaded to: {asr_local_path}")
67
  except Exception as e:
 
103
 
104
 
105
  def get_asr_model():
106
+ """Lazy load ASR model from HuggingFace."""
107
  global _asr_model
108
  if _asr_model is None:
 
109
  setup_cache_env()
110
 
111
  from funasr import AutoModel
112
  print("Loading ASR model...")
113
+ print(f" HF_HOME={os.environ.get('HF_HOME')}")
114
  _asr_model = AutoModel(
115
+ model="FunAudioLLM/SenseVoiceSmall", # HuggingFace model ID
116
+ hub="hf", # Use HuggingFace Hub
117
  disable_update=True,
118
  log_level='INFO',
119
  device="cuda:0",
 
122
  return _asr_model
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def get_voxcpm_model():
126
+ """Lazy load VoxCPM model (without denoiser)."""
127
  global _voxcpm_model
128
  if _voxcpm_model is None:
 
129
  setup_cache_env()
130
 
131
  import voxcpm
 
133
  model_dir = _resolve_model_dir()
134
  print(f"Using model dir: {model_dir}")
135
 
 
 
 
 
136
  _voxcpm_model = voxcpm.VoxCPM(
137
  voxcpm_model_path=model_dir,
138
  optimize=True,
139
+ enable_denoiser=False, # Disable denoiser to avoid ZipEnhancer download
 
140
  )
141
  print("VoxCPM model loaded.")
142
  return _voxcpm_model
 
161
  cfg_value_input: float = 2.0,
162
  inference_timesteps_input: int = 10,
163
  do_normalize: bool = True,
 
164
  ) -> Tuple[int, np.ndarray]:
165
  """
166
  GPU function: Generate speech from text using VoxCPM.
 
191
  cfg_value=float(cfg_value_input),
192
  inference_timesteps=int(inference_timesteps_input),
193
  normalize=do_normalize,
194
+ denoise=False, # Denoiser disabled
195
  )
196
  return (voxcpm_model.tts_model.sample_rate, wav)
197
  finally:
 
210
  cfg_value_input: float = 2.0,
211
  inference_timesteps_input: int = 10,
212
  do_normalize: bool = True,
 
213
  ) -> Tuple[int, np.ndarray]:
214
  """
215
  Wrapper: Read audio file in CPU, then call GPU function.
 
233
  cfg_value_input=cfg_value_input,
234
  inference_timesteps_input=inference_timesteps_input,
235
  do_normalize=do_normalize,
 
236
  )
237
 
238
 
 
299
  # Pro Tips
300
  with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
301
  gr.Markdown("""
 
 
 
 
 
 
302
  ### Text Normalization|文本正则化
303
  - **Enable** to process general text with an external WeTextProcessing component.
304
  **启用**:使用 WeTextProcessing 组件,可支持常见文本的正则化处理。
 
327
  label="Prompt Speech (Optional, or let VoxCPM improvise)",
328
  value="./examples/example.wav",
329
  )
 
 
 
 
 
 
330
  with gr.Row():
331
  prompt_text = gr.Textbox(
332
  value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.",
 
369
  # Wiring
370
  run_btn.click(
371
  fn=generate_tts_audio,
372
+ inputs=[text, prompt_wav, prompt_text, cfg_value, inference_timesteps, DoNormalizeText],
373
  outputs=[audio_output],
374
  show_progress=True,
375
  api_name="generate",