Commit 633ab26
phucbienvan committed
Parent(s): e02c9de

fix zero gpu
Files changed:
- .gitattributes +0 -0
- .gitignore +0 -0
- README.md +0 -0
- app.py +58 -50
- generator.py +1 -0
- hf_requirements.txt +0 -0
- models.py +0 -0
- requirements.txt +1 -1
- test_model.py +0 -0
- watermarking.py +0 -0
.gitattributes CHANGED
File without changes

.gitignore CHANGED
File without changes

README.md CHANGED
File without changes
app.py CHANGED

@@ -11,6 +11,7 @@ from dataclasses import dataclass
 from generator import Segment, load_csm_1b
 from huggingface_hub import login
 
+
 # Disable torch compile feature to avoid triton error
 torch._dynamo.config.suppress_errors = True
 
@@ -36,7 +37,7 @@ generator = None
 model_loaded = False
 
 # Function to load model in ZeroGPU
-@spaces.GPU(duration=30)
+# @spaces.GPU(duration=30)
 def initialize_model():
     global generator, model_loaded
     if not model_loaded:
@@ -47,7 +48,7 @@ def initialize_model():
     return generator
 
 # Function to get the loaded model
-@spaces.GPU(duration=30)
+# @spaces.GPU(duration=30)
 def get_model():
     global generator, model_loaded
     if not model_loaded:
@@ -80,13 +81,13 @@ def audio_to_tensor(audio_path: str) -> Tuple[torch.Tensor, int]:
 
 # Function to save audio tensor to file
 def save_audio(audio_tensor: torch.Tensor, sample_rate: int) -> str:
-
-    output_path =
+    # Save the file into the current directory, or the files directory Gradio uses by default
+    output_path = f"csm1b_output_{int(time.time())}.wav"
     torchaudio.save(output_path, audio_tensor.unsqueeze(0), sample_rate)
     return output_path
 
 # Function to generate speech from text using ZeroGPU
-@spaces.GPU(duration=30)
+# @spaces.GPU(duration=30)
 def generate_speech(
     text: str,
     speaker_id: int,
@@ -132,13 +133,14 @@ def generate_speech(
             speaker=speaker_id,
             context=context,
             max_audio_length_ms=max_duration_ms,
-            temperature=temperature,
-            topk=top_k
+            # temperature=temperature,
+            # topk=top_k
         )
 
         progress(0.8, "Saving audio...")
         # Save audio to file
-        output_path = save_audio(audio, generator.sample_rate)
+        # output_path = save_audio(audio, generator.sample_rate)
+        output_path = f"csm1b_output_{int(time.time())}.wav"
 
         progress(1.0, "Completed!")
         return output_path
@@ -156,7 +158,7 @@ def generate_speech(
         return f"Error generating speech: {str(e)}"
 
 # Function to generate simple speech without context
-@spaces.GPU(duration=30)
+# @spaces.GPU(duration=30)
 def generate_speech_simple(
     text: str,
     speaker_id: int,
@@ -176,17 +178,23 @@ def generate_speech_simple(
             speaker=speaker_id,
             context=[], # No context
             max_audio_length_ms=max_duration_ms,
-            temperature=temperature,
-            topk=top_k
+            # temperature=temperature,
+            # topk=top_k
         )
 
         progress(0.8, "Saving audio...")
         # Save audio to file
-        output_path = save_audio(audio, generator.sample_rate)
+        # output_path = save_audio(audio, generator.sample_rate)
+        output_path = f"csm1b_output_{int(time.time())}.wav"
+        torchaudio.save(output_path, audio.unsqueeze(0).cpu(), generator.sample_rate)
+
+
+
+        print(f"Audio saved to {output_path}")
 
         progress(1.0, "Completed!")
         return output_path
-    except
+    except Exception as e:
         # Handle ZeroGPU quota exceeded error
         error_message = str(e)
         if "GPU quota exceeded" in error_message:
@@ -229,25 +237,25 @@ def create_demo():
                     value=30000,
                     step=1000
                 )
-                temperature = gr.Slider(
-                    label="Temperature",
-                    minimum=0.1,
-                    maximum=1.5,
-                    value=0.9,
-                    step=0.1
-                )
-                top_k = gr.Slider(
-                    label="Top-K",
-                    minimum=1,
-                    maximum=100,
-                    value=50,
-                    step=1
-                )
+                # temperature = gr.Slider(
+                #     label="Temperature",
+                #     minimum=0.1,
+                #     maximum=1.5,
+                #     value=0.9,
+                #     step=0.1
+                # )
+                # top_k = gr.Slider(
+                #     label="Top-K",
+                #     minimum=1,
+                #     maximum=100,
+                #     value=50,
+                #     step=1
+                # )
 
                 generate_btn = gr.Button("Generate Audio")
 
             with gr.Column():
-                output_audio = gr.Audio(label="Output Audio", type="filepath")
+                output_audio = gr.Audio(label="Output Audio", type="filepath", autoplay=True)
 
         with gr.Tab("Audio Generation with Context"):
             gr.Markdown("This feature allows you to provide audio clips and text as context to help the model generate more appropriate speech.")
@@ -281,25 +289,25 @@ def create_demo():
                     value=30000,
                     step=1000
                 )
-                temperature_context = gr.Slider(
-                    label="Temperature",
-                    minimum=0.1,
-                    maximum=1.5,
-                    value=0.9,
-                    step=0.1
-                )
-                top_k_context = gr.Slider(
-                    label="Top-K",
-                    minimum=1,
-                    maximum=100,
-                    value=50,
-                    step=1
-                )
+                # temperature_context = gr.Slider(
+                #     label="Temperature",
+                #     minimum=0.1,
+                #     maximum=1.5,
+                #     value=0.9,
+                #     step=0.1
+                # )
+                # top_k_context = gr.Slider(
+                #     label="Top-K",
+                #     minimum=1,
+                #     maximum=100,
+                #     value=50,
+                #     step=1
+                # )
 
                 generate_context_btn = gr.Button("Generate Audio with Context")
 
             with gr.Column():
-                output_audio_context = gr.Audio(label="Output Audio", type="filepath")
+                output_audio_context = gr.Audio(label="Output Audio", type="filepath", autoplay=True)
 
         # Add Hugging Face configuration tab
         with gr.Tab("Configuration"):
@@ -357,7 +365,7 @@ def create_demo():
             If you encounter a "GPU quota exceeded" error, please wait for the specified time and try again.
             """)
 
-            @spaces.GPU(duration=10)
+            # @spaces.GPU(duration=10)
             def check_gpu():
                 if torch.cuda.is_available():
                     gpu_name = torch.cuda.get_device_name(0)
@@ -375,7 +383,7 @@ def create_demo():
             load_model_btn = gr.Button("Load Model")
             model_status = gr.Textbox(label="Model Status", interactive=False)
 
-            @spaces.GPU(duration=10)
+            # @spaces.GPU(duration=10)
             def load_model_and_report():
                 global model_loaded
                 if model_loaded:
@@ -393,8 +401,8 @@ def create_demo():
             text_input,
             speaker_id,
             max_duration,
-            temperature,
-            top_k
+            # temperature,
+            # top_k
         ],
         outputs=output_audio
     )
@@ -411,8 +419,8 @@ def create_demo():
             context_text2,
             context_speaker2,
             max_duration_context,
-            temperature_context,
-            top_k_context
+            # temperature_context,
+            # top_k_context
         ],
         outputs=output_audio_context
     )
@@ -422,4 +430,4 @@ def create_demo():
 # Launch the application
 if __name__ == "__main__":
     demo = create_demo()
-    demo.queue().launch()
+    demo.queue().launch(share=True)
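Note: taken together, the app.py changes comment out every @spaces.GPU decorator and the temperature/top-k controls, and write output WAVs straight into the working directory with a timestamped name so Gradio's type="filepath" audio components can serve them. Below is a minimal, self-contained sketch of that save pattern, not the Space's exact code: the helper name and the synthetic sine-wave input are illustrative, while the unsqueeze(0).cpu() call mirrors the diff.

import time

import torch
import torchaudio

def save_generated_audio(audio: torch.Tensor, sample_rate: int) -> str:
    # Timestamped name in the current working directory, which Gradio can
    # serve directly for type="filepath" outputs (as the commit does).
    output_path = f"csm1b_output_{int(time.time())}.wav"
    # torchaudio.save expects a (channels, samples) tensor; the diff's
    # unsqueeze(0) implies a 1-D mono tensor, and .cpu() moves the result
    # off the ZeroGPU device before writing.
    torchaudio.save(output_path, audio.unsqueeze(0).cpu(), sample_rate)
    return output_path

# Hypothetical usage: a one-second 440 Hz sine at 24 kHz stands in for model output.
if __name__ == "__main__":
    sr = 24_000
    t = torch.arange(sr) / sr
    print(save_generated_audio(torch.sin(2 * torch.pi * 440 * t), sr))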
generator.py CHANGED

@@ -178,6 +178,7 @@ def load_csm_1b(device: str = "cuda") -> Generator:
     try:
         # In ZeroGPU, CUDA should not be initialized in the main process
         # Only move the model to GPU when called in a function with the @spaces.GPU decorator
+        print(f"Loading model on {device}")
         if 'cuda' in device and not torch.cuda.is_initialized():
             # Use CPU for the main process
             model = Model.from_pretrained("sesame/csm-1b")
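Note: the comments in this hunk state ZeroGPU's core constraint: CUDA must not be initialized in the main process, so weights are loaded on CPU and moved to the GPU only inside a @spaces.GPU-decorated function. A hedged sketch of that pattern follows; spaces.GPU(duration=...) is the decorator used throughout this commit, but _load_on_cpu is a stand-in for the repo's Model.from_pretrained("sesame/csm-1b") call, and run() is illustrative rather than the Space's actual wiring.

import spaces  # Hugging Face ZeroGPU helper, available inside Spaces
import torch

_model = None  # loaded lazily so the main process never touches CUDA

def _load_on_cpu() -> torch.nn.Module:
    # Stand-in for Model.from_pretrained("sesame/csm-1b"); the key point
    # is that loading happens on CPU, without initializing CUDA.
    return torch.nn.Linear(4, 4)

@spaces.GPU(duration=30)  # a GPU is attached only while this function runs
def run(x: torch.Tensor) -> torch.Tensor:
    global _model
    if _model is None:
        _model = _load_on_cpu()
    _model.to("cuda")  # safe here: we are inside the decorated call
    return _model(x.to("cuda")).cpu()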
hf_requirements.txt CHANGED
File without changes

models.py CHANGED
File without changes
requirements.txt CHANGED

@@ -3,7 +3,7 @@ torchaudio==2.4.0
 tokenizers==0.21.0
 transformers==4.49.0
 huggingface_hub==0.28.1
-moshi==0.2.2
+# moshi==0.2.2
 torchtune==0.4.0
 torchao==0.9.0
 silentcipher @ git+https://github.com/SesameAILabs/silentcipher@master
test_model.py CHANGED
File without changes

watermarking.py CHANGED
File without changes