Opera8 committed on
Commit
e43ceb5
·
verified ·
1 Parent(s): 380e75f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -13
app.py CHANGED
@@ -11,6 +11,7 @@ from huggingface_hub import snapshot_download, hf_hub_download
11
  import subprocess
12
  import re
13
  import spaces
 
14
 
15
  # Create a global variable to track downloaded resources
16
  downloaded_resources = {
@@ -599,9 +600,9 @@ def vevo_style(content_wav, style_wav):
599
  print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
600
  print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
601
 
602
- # Save audio
603
- torchaudio.save(temp_content_path, content_tensor, content_sr, backend="soundfile")
604
- torchaudio.save(temp_style_path, style_tensor, style_sr, backend="soundfile")
605
 
606
  try:
607
  # Get pipeline
@@ -694,9 +695,9 @@ def vevo_timbre(content_wav, reference_wav):
694
  print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
695
  print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
696
 
697
- # Save uploaded audio
698
- torchaudio.save(temp_content_path, content_tensor, content_sr, backend="soundfile")
699
- torchaudio.save(temp_reference_path, reference_tensor, reference_sr, backend="soundfile")
700
 
701
  try:
702
  # Get pipeline
@@ -814,10 +815,10 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
814
  print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
815
  print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
816
 
817
- # Save uploaded audio
818
- torchaudio.save(temp_content_path, content_tensor, content_sr, backend="soundfile")
819
- torchaudio.save(temp_style_path, style_tensor, style_sr, backend="soundfile")
820
- torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr, backend="soundfile")
821
 
822
  try:
823
  # Get pipeline
@@ -887,8 +888,8 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
887
  if style_ref_text:
888
  print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
889
 
890
- # Save uploaded audio
891
- torchaudio.save(temp_ref_path, ref_tensor, ref_sr, backend="soundfile")
892
 
893
  if timbre_ref_wav is not None:
894
  if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
@@ -913,7 +914,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
913
  timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
914
 
915
  print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
916
- torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr, backend="soundfile")
917
  else:
918
  raise ValueError("Invalid timbre reference audio format")
919
  else:
 
11
  import subprocess
12
  import re
13
  import spaces
14
+ import soundfile as sf # Importing soundfile directly
15
 
16
  # Create a global variable to track downloaded resources
17
  downloaded_resources = {
 
600
  print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
601
  print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
602
 
603
+ # Save audio DIRECTLY using soundfile (bypassing torchaudio to avoid TorchCodec error)
604
+ sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
605
+ sf.write(temp_style_path, style_tensor.squeeze().cpu().numpy(), style_sr)
606
 
607
  try:
608
  # Get pipeline
 
695
  print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
696
  print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
697
 
698
+ # Save uploaded audio DIRECTLY using soundfile
699
+ sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
700
+ sf.write(temp_reference_path, reference_tensor.squeeze().cpu().numpy(), reference_sr)
701
 
702
  try:
703
  # Get pipeline
 
815
  print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
816
  print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
817
 
818
+ # Save uploaded audio DIRECTLY using soundfile
819
+ sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
820
+ sf.write(temp_style_path, style_tensor.squeeze().cpu().numpy(), style_sr)
821
+ sf.write(temp_timbre_path, timbre_tensor.squeeze().cpu().numpy(), timbre_sr)
822
 
823
  try:
824
  # Get pipeline
 
888
  if style_ref_text:
889
  print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
890
 
891
+ # Save uploaded audio DIRECTLY using soundfile
892
+ sf.write(temp_ref_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
893
 
894
  if timbre_ref_wav is not None:
895
  if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
 
914
  timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
915
 
916
  print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
917
+ sf.write(temp_timbre_path, timbre_tensor.squeeze().cpu().numpy(), timbre_sr)
918
  else:
919
  raise ValueError("Invalid timbre reference audio format")
920
  else: