Update app.py
Browse files
app.py
CHANGED
|
@@ -11,6 +11,7 @@ from huggingface_hub import snapshot_download, hf_hub_download
|
|
| 11 |
import subprocess
|
| 12 |
import re
|
| 13 |
import spaces
|
|
|
|
| 14 |
|
| 15 |
# Create a global variable to track downloaded resources
|
| 16 |
downloaded_resources = {
|
|
@@ -599,9 +600,9 @@ def vevo_style(content_wav, style_wav):
|
|
| 599 |
print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
|
| 600 |
print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
|
| 601 |
|
| 602 |
-
# Save audio
|
| 603 |
-
|
| 604 |
-
|
| 605 |
|
| 606 |
try:
|
| 607 |
# Get pipeline
|
|
@@ -694,9 +695,9 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 694 |
print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
|
| 695 |
print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
|
| 696 |
|
| 697 |
-
# Save uploaded audio
|
| 698 |
-
|
| 699 |
-
|
| 700 |
|
| 701 |
try:
|
| 702 |
# Get pipeline
|
|
@@ -814,10 +815,10 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
|
| 814 |
print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
|
| 815 |
print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
|
| 816 |
|
| 817 |
-
# Save uploaded audio
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
|
| 822 |
try:
|
| 823 |
# Get pipeline
|
|
@@ -887,8 +888,8 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
|
|
| 887 |
if style_ref_text:
|
| 888 |
print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
|
| 889 |
|
| 890 |
-
# Save uploaded audio
|
| 891 |
-
|
| 892 |
|
| 893 |
if timbre_ref_wav is not None:
|
| 894 |
if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
|
|
@@ -913,7 +914,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
|
|
| 913 |
timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
|
| 914 |
|
| 915 |
print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
|
| 916 |
-
|
| 917 |
else:
|
| 918 |
raise ValueError("Invalid timbre reference audio format")
|
| 919 |
else:
|
|
|
|
| 11 |
import subprocess
|
| 12 |
import re
|
| 13 |
import spaces
|
| 14 |
+
import soundfile as sf # Importing soundfile directly
|
| 15 |
|
| 16 |
# Create a global variable to track downloaded resources
|
| 17 |
downloaded_resources = {
|
|
|
|
| 600 |
print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
|
| 601 |
print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
|
| 602 |
|
| 603 |
+
# Save audio DIRECTLY using soundfile (bypassing torchaudio to avoid TorchCodec error)
|
| 604 |
+
sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
|
| 605 |
+
sf.write(temp_style_path, style_tensor.squeeze().cpu().numpy(), style_sr)
|
| 606 |
|
| 607 |
try:
|
| 608 |
# Get pipeline
|
|
|
|
| 695 |
print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
|
| 696 |
print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
|
| 697 |
|
| 698 |
+
# Save uploaded audio DIRECTLY using soundfile
|
| 699 |
+
sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
|
| 700 |
+
sf.write(temp_reference_path, reference_tensor.squeeze().cpu().numpy(), reference_sr)
|
| 701 |
|
| 702 |
try:
|
| 703 |
# Get pipeline
|
|
|
|
| 815 |
print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
|
| 816 |
print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
|
| 817 |
|
| 818 |
+
# Save uploaded audio DIRECTLY using soundfile
|
| 819 |
+
sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
|
| 820 |
+
sf.write(temp_style_path, style_tensor.squeeze().cpu().numpy(), style_sr)
|
| 821 |
+
sf.write(temp_timbre_path, timbre_tensor.squeeze().cpu().numpy(), timbre_sr)
|
| 822 |
|
| 823 |
try:
|
| 824 |
# Get pipeline
|
|
|
|
| 888 |
if style_ref_text:
|
| 889 |
print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
|
| 890 |
|
| 891 |
+
# Save uploaded audio DIRECTLY using soundfile
|
| 892 |
+
sf.write(temp_ref_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
|
| 893 |
|
| 894 |
if timbre_ref_wav is not None:
|
| 895 |
if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
|
|
|
|
| 914 |
timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
|
| 915 |
|
| 916 |
print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
|
| 917 |
+
sf.write(temp_timbre_path, timbre_tensor.squeeze().cpu().numpy(), timbre_sr)
|
| 918 |
else:
|
| 919 |
raise ValueError("Invalid timbre reference audio format")
|
| 920 |
else:
|