Spaces:

Opera8
/

Sada

Running on Zero

App Files Community

Opera8 committed 23 days ago

Commit

e43ceb5

verified ·

1 Parent(s): 380e75f

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -13

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from huggingface_hub import snapshot_download, hf_hub_download
 import subprocess
 import re
 import spaces
 # Create a global variable to track downloaded resources
 downloaded_resources = {
@@ -599,9 +600,9 @@ def vevo_style(content_wav, style_wav):
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
-    # Save audio
-    torchaudio.save(temp_content_path, content_tensor, content_sr, backend="soundfile")
-    torchaudio.save(temp_style_path, style_tensor, style_sr, backend="soundfile")
     try:
         # Get pipeline
@@ -694,9 +695,9 @@ def vevo_timbre(content_wav, reference_wav):
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
-    # Save uploaded audio
-    torchaudio.save(temp_content_path, content_tensor, content_sr, backend="soundfile")
-    torchaudio.save(temp_reference_path, reference_tensor, reference_sr, backend="soundfile")
     try:
         # Get pipeline
@@ -814,10 +815,10 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
     print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
     print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
-    # Save uploaded audio
-    torchaudio.save(temp_content_path, content_tensor, content_sr, backend="soundfile")
-    torchaudio.save(temp_style_path, style_tensor, style_sr, backend="soundfile")
-    torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr, backend="soundfile")
     try:
         # Get pipeline
@@ -887,8 +888,8 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
     if style_ref_text:
         print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
-    # Save uploaded audio
-    torchaudio.save(temp_ref_path, ref_tensor, ref_sr, backend="soundfile")
     if timbre_ref_wav is not None:
         if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
@@ -913,7 +914,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
             timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
             print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
-            torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr, backend="soundfile")
         else:
             raise ValueError("Invalid timbre reference audio format")
     else:

 import subprocess
 import re
 import spaces
+import soundfile as sf  # Importing soundfile directly
 # Create a global variable to track downloaded resources
 downloaded_resources = {
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
+    # Save audio DIRECTLY using soundfile (bypassing torchaudio to avoid TorchCodec error)
+    sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
+    sf.write(temp_style_path, style_tensor.squeeze().cpu().numpy(), style_sr)
     try:
         # Get pipeline
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
+    # Save uploaded audio DIRECTLY using soundfile
+    sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
+    sf.write(temp_reference_path, reference_tensor.squeeze().cpu().numpy(), reference_sr)
     try:
         # Get pipeline
     print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
     print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
+    # Save uploaded audio DIRECTLY using soundfile
+    sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
+    sf.write(temp_style_path, style_tensor.squeeze().cpu().numpy(), style_sr)
+    sf.write(temp_timbre_path, timbre_tensor.squeeze().cpu().numpy(), timbre_sr)
     try:
         # Get pipeline
     if style_ref_text:
         print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
+    # Save uploaded audio DIRECTLY using soundfile
+    sf.write(temp_ref_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
     if timbre_ref_wav is not None:
         if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
             timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
             print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
+            sf.write(temp_timbre_path, timbre_tensor.squeeze().cpu().numpy(), timbre_sr)
         else:
             raise ValueError("Invalid timbre reference audio format")
     else: