Spaces:

AIDC-AI
/

Marco-Voice-TTS

Running

App Files Files Community

tianfengping.tfp committed on 14 days ago

Commit

a723d72

1 Parent(s): 1bd43c9

load model after init

Browse files

Files changed (1) hide show

app.py +54 -42

app.py CHANGED Viewed

@@ -36,27 +36,41 @@ os.system('export PYTHONPATH=third_party/Matcha-TTS')
 from huggingface_hub import hf_hub_download
-# Download assets and logos first (these are small files)
-try:
-    assets_dir = snapshot_download(
-        repo_id="tienfeng/prompt",
-        repo_type="dataset",
-    )
-    logo_path = hf_hub_download(
-        repo_id="tienfeng/prompt",
-        filename="logo2.png",
-        repo_type="dataset",
-    )
-    logo_path2 = hf_hub_download(
-        repo_id="tienfeng/prompt",
-        filename="logo.png",
-        repo_type="dataset",
-    )
-except Exception as e:
-    print(f"Warning: Failed to download assets/logos: {e}")
-    assets_dir = None
-    logo_path = None
-    logo_path2 = None
 # Delay model download to avoid blocking startup
 model_repo_id = "AIDC-AI/Marco-Voice"
@@ -157,16 +171,20 @@ os.makedirs("./tmp", exist_ok=True)
 def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
     global tts_speakerminus_global, local_model_path
-    # Ensure models are downloaded
     if local_model_path is None:
         load_models()
     if 'tts_speakerminus_global' not in globals() or tts_speakerminus_global is None:
         print("Loading CosyVoice (speakerminus) model...")
         tts_speakerminus_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path)
     if not ref_audio and not ref_text:
-        if audio_prompt_path is None:
-            raise ValueError("Audio prompt path is not available. Please provide reference audio and text.")
         ref_text = text_prompt.get(speaker, "")
         speaker_audio_name = audio_prompt.get(speaker)
         if speaker_audio_name:
@@ -241,15 +259,19 @@ def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_t
 def generate_speech_sft(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
     global tts_sft_global, local_model_path_enhenced
-    # Ensure models are downloaded
     if local_model_path_enhenced is None:
         load_models()
     if 'tts_sft_global' not in globals() or tts_sft_global is None:
         print("Loading CosyVoice (SFT enhanced) model...")
         tts_sft_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path_enhenced)
     if not ref_audio and not ref_text:
-        if audio_prompt_path is None:
-            raise ValueError("Audio prompt path is not available. Please provide reference audio and text.")
         ref_text = text_prompt.get(speaker, "")
         speaker_audio_name = audio_prompt.get(speaker)
         if speaker_audio_name:
@@ -638,7 +660,9 @@ input[type="text"]:focus, textarea:focus {
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     with gr.Column(elem_classes="header"):
         with gr.Row(elem_id="header-row", variant="compact"):
-            gr.Image(value=logo_path,
                      elem_id="logo-container",
                      show_label=False,
                      show_download_button=False,
@@ -823,20 +847,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
         outputs=tts_v2_output
     )
-def preload_models():
-    """Pre-download models to cache (non-blocking for launch)"""
-    import threading
-    def _download():
-        try:
-            print("Pre-downloading models to cache...")
-            load_models()
-            print("Model pre-download completed.")
-        except Exception as e:
-            print(f"Warning: Model pre-download failed: {e}. Models will be loaded on first use.")
-    threading.Thread(target=_download, daemon=True).start()
-# Start preloading models in background (non-blocking)
-preload_models()
 if __name__ == "__main__":
     # Use environment variable for port (Hugging Face Spaces uses 7860 by default)

 from huggingface_hub import hf_hub_download
+# Download assets and logos in background to avoid blocking startup
+assets_dir = None
+logo_path = None
+logo_path2 = None
+def load_assets():
+    """Load assets lazily"""
+    global assets_dir, logo_path, logo_path2
+    if assets_dir is None:
+        try:
+            print("Downloading assets and logos...")
+            assets_dir = snapshot_download(
+                repo_id="tienfeng/prompt",
+                repo_type="dataset",
+            )
+            logo_path = hf_hub_download(
+                repo_id="tienfeng/prompt",
+                filename="logo2.png",
+                repo_type="dataset",
+            )
+            logo_path2 = hf_hub_download(
+                repo_id="tienfeng/prompt",
+                filename="logo.png",
+                repo_type="dataset",
+            )
+            print("Assets downloaded successfully")
+        except Exception as e:
+            print(f"Warning: Failed to download assets/logos: {e}")
+            assets_dir = None
+            logo_path = None
+            logo_path2 = None
+# Start downloading assets in background (non-blocking)
+import threading
+threading.Thread(target=load_assets, daemon=True).start()
 # Delay model download to avoid blocking startup
 model_repo_id = "AIDC-AI/Marco-Voice"
 def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
     global tts_speakerminus_global, local_model_path
+    # Ensure models are downloaded (this may take time on first use)
     if local_model_path is None:
+        print("Downloading models (this may take a few minutes on first use)...")
         load_models()
     if 'tts_speakerminus_global' not in globals() or tts_speakerminus_global is None:
         print("Loading CosyVoice (speakerminus) model...")
         tts_speakerminus_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path)
     if not ref_audio and not ref_text:
+        # Ensure assets are loaded
+        if assets_dir is None:
+            load_assets()
+        if audio_prompt_path is None or assets_dir is None:
+            raise ValueError("Audio prompt path is not available. Please wait a moment and try again, or provide reference audio and text.")
         ref_text = text_prompt.get(speaker, "")
         speaker_audio_name = audio_prompt.get(speaker)
         if speaker_audio_name:
 def generate_speech_sft(tts_text, speed, speaker, key, ref_audio, ref_text):
     # import pdb;pdb.set_trace()
     global tts_sft_global, local_model_path_enhenced
+    # Ensure models are downloaded (this may take time on first use)
     if local_model_path_enhenced is None:
+        print("Downloading models (this may take a few minutes on first use)...")
         load_models()
     if 'tts_sft_global' not in globals() or tts_sft_global is None:
         print("Loading CosyVoice (SFT enhanced) model...")
         tts_sft_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path_enhenced)
     if not ref_audio and not ref_text:
+        # Ensure assets are loaded
+        if assets_dir is None:
+            load_assets()
+        if audio_prompt_path is None or assets_dir is None:
+            raise ValueError("Audio prompt path is not available. Please wait a moment and try again, or provide reference audio and text.")
         ref_text = text_prompt.get(speaker, "")
         speaker_audio_name = audio_prompt.get(speaker)
         if speaker_audio_name:
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     with gr.Column(elem_classes="header"):
         with gr.Row(elem_id="header-row", variant="compact"):
+            # Load logo if available, otherwise use placeholder
+            logo_value = logo_path if logo_path is not None else None
+            gr.Image(value=logo_value,
                      elem_id="logo-container",
                      show_label=False,
                      show_download_button=False,
         outputs=tts_v2_output
     )
+# Don't preload models - let them download on first use to avoid startup timeout
+# Models will be downloaded and loaded lazily when first requested by user
 if __name__ == "__main__":
     # Use environment variable for port (Hugging Face Spaces uses 7860 by default)