Spaces:
Running
on
Zero
Running
on
Zero
刘鑫
committed on
Commit
·
f6f653d
1
Parent(s):
1700cda
set zero gpu inference
Browse files
app.py
CHANGED
|
@@ -8,6 +8,13 @@ from pathlib import Path
|
|
| 8 |
import tempfile
|
| 9 |
import soundfile as sf
|
| 10 |
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def setup_cache_env():
|
|
@@ -109,9 +116,8 @@ def get_asr_model():
|
|
| 109 |
global _asr_model
|
| 110 |
if _asr_model is None:
|
| 111 |
from funasr import AutoModel
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
print(f" Using local path: {ASR_LOCAL_DIR}")
|
| 115 |
start_time = time.time()
|
| 116 |
_asr_model = AutoModel(
|
| 117 |
model=ASR_LOCAL_DIR, # Use local directory path
|
|
@@ -120,8 +126,8 @@ def get_asr_model():
|
|
| 120 |
device="cuda:0",
|
| 121 |
)
|
| 122 |
load_time = time.time() - start_time
|
| 123 |
-
|
| 124 |
-
|
| 125 |
return _asr_model
|
| 126 |
|
| 127 |
|
|
@@ -130,9 +136,8 @@ def get_voxcpm_model():
|
|
| 130 |
global _voxcpm_model
|
| 131 |
if _voxcpm_model is None:
|
| 132 |
import voxcpm
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
print(f" Using local path: {VOXCPM_LOCAL_DIR}")
|
| 136 |
start_time = time.time()
|
| 137 |
_voxcpm_model = voxcpm.VoxCPM(
|
| 138 |
voxcpm_model_path=VOXCPM_LOCAL_DIR,
|
|
@@ -140,8 +145,8 @@ def get_voxcpm_model():
|
|
| 140 |
enable_denoiser=False, # Disable denoiser to avoid ZipEnhancer download
|
| 141 |
)
|
| 142 |
load_time = time.time() - start_time
|
| 143 |
-
|
| 144 |
-
|
| 145 |
return _voxcpm_model
|
| 146 |
|
| 147 |
|
|
@@ -150,16 +155,16 @@ def prompt_wav_recognition(prompt_wav: Optional[str]) -> str:
|
|
| 150 |
"""Use ASR to recognize prompt audio text."""
|
| 151 |
if prompt_wav is None or not prompt_wav.strip():
|
| 152 |
return ""
|
| 153 |
-
|
| 154 |
-
|
| 155 |
asr_model = get_asr_model()
|
| 156 |
start_time = time.time()
|
| 157 |
res = asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
|
| 158 |
inference_time = time.time() - start_time
|
| 159 |
text = res[0]["text"].split('|>')[-1]
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
return text
|
| 164 |
|
| 165 |
|
|
@@ -193,9 +198,9 @@ def generate_tts_audio_gpu(
|
|
| 193 |
prompt_wav_path = f.name
|
| 194 |
|
| 195 |
try:
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
start_time = time.time()
|
| 200 |
wav = voxcpm_model.generate(
|
| 201 |
text=text,
|
|
@@ -209,8 +214,8 @@ def generate_tts_audio_gpu(
|
|
| 209 |
inference_time = time.time() - start_time
|
| 210 |
audio_duration = len(wav) / voxcpm_model.tts_model.sample_rate
|
| 211 |
rtf = inference_time / audio_duration if audio_duration > 0 else 0
|
| 212 |
-
|
| 213 |
-
|
| 214 |
return (voxcpm_model.tts_model.sample_rate, wav)
|
| 215 |
finally:
|
| 216 |
# Cleanup temp file
|
|
|
|
| 8 |
import tempfile
|
| 9 |
import soundfile as sf
|
| 10 |
import time
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def log(msg: str) -> None:
    """Print *msg* to stdout prefixed with a local-time timestamp.

    The line is emitted as ``[YYYY-MM-DD HH:MM:SS] msg``.

    Args:
        msg: The text to log.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Flush right away: when stdout is not a TTY it may be block-buffered,
    # which would delay log lines relative to the events they describe.
    print(f"[{timestamp}] {msg}", flush=True)
|
| 18 |
|
| 19 |
|
| 20 |
def setup_cache_env():
|
|
|
|
| 116 |
global _asr_model
|
| 117 |
if _asr_model is None:
|
| 118 |
from funasr import AutoModel
|
| 119 |
+
log("=" * 50)
|
| 120 |
+
log(f"Loading ASR model from: {ASR_LOCAL_DIR}")
|
|
|
|
| 121 |
start_time = time.time()
|
| 122 |
_asr_model = AutoModel(
|
| 123 |
model=ASR_LOCAL_DIR, # Use local directory path
|
|
|
|
| 126 |
device="cuda:0",
|
| 127 |
)
|
| 128 |
load_time = time.time() - start_time
|
| 129 |
+
log(f"ASR model loaded. (耗时: {load_time:.2f}s)")
|
| 130 |
+
log("=" * 50)
|
| 131 |
return _asr_model
|
| 132 |
|
| 133 |
|
|
|
|
| 136 |
global _voxcpm_model
|
| 137 |
if _voxcpm_model is None:
|
| 138 |
import voxcpm
|
| 139 |
+
log("=" * 50)
|
| 140 |
+
log(f"Loading VoxCPM model from: {VOXCPM_LOCAL_DIR}")
|
|
|
|
| 141 |
start_time = time.time()
|
| 142 |
_voxcpm_model = voxcpm.VoxCPM(
|
| 143 |
voxcpm_model_path=VOXCPM_LOCAL_DIR,
|
|
|
|
| 145 |
enable_denoiser=False, # Disable denoiser to avoid ZipEnhancer download
|
| 146 |
)
|
| 147 |
load_time = time.time() - start_time
|
| 148 |
+
log(f"VoxCPM model loaded. (耗时: {load_time:.2f}s)")
|
| 149 |
+
log("=" * 50)
|
| 150 |
return _voxcpm_model
|
| 151 |
|
| 152 |
|
|
|
|
| 155 |
"""Use ASR to recognize prompt audio text."""
|
| 156 |
if prompt_wav is None or not prompt_wav.strip():
|
| 157 |
return ""
|
| 158 |
+
log("=" * 50)
|
| 159 |
+
log("[ASR] 开始语音识别...")
|
| 160 |
asr_model = get_asr_model()
|
| 161 |
start_time = time.time()
|
| 162 |
res = asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
|
| 163 |
inference_time = time.time() - start_time
|
| 164 |
text = res[0]["text"].split('|>')[-1]
|
| 165 |
+
log(f"[ASR] 识别结果: {text}")
|
| 166 |
+
log(f"[ASR] 推理耗时: {inference_time:.2f}s")
|
| 167 |
+
log("=" * 50)
|
| 168 |
return text
|
| 169 |
|
| 170 |
|
|
|
|
| 198 |
prompt_wav_path = f.name
|
| 199 |
|
| 200 |
try:
|
| 201 |
+
log("=" * 50)
|
| 202 |
+
log("[TTS] 开始语音合成...")
|
| 203 |
+
log(f"[TTS] 目标文本: {text}")
|
| 204 |
start_time = time.time()
|
| 205 |
wav = voxcpm_model.generate(
|
| 206 |
text=text,
|
|
|
|
| 214 |
inference_time = time.time() - start_time
|
| 215 |
audio_duration = len(wav) / voxcpm_model.tts_model.sample_rate
|
| 216 |
rtf = inference_time / audio_duration if audio_duration > 0 else 0
|
| 217 |
+
log(f"[TTS] 推理耗时: {inference_time:.2f}s | 音频时长: {audio_duration:.2f}s | RTF: {rtf:.3f}")
|
| 218 |
+
log("=" * 50)
|
| 219 |
return (voxcpm_model.tts_model.sample_rate, wav)
|
| 220 |
finally:
|
| 221 |
# Cleanup temp file
|