Add Whisper transcription feature for automatic audio-to-text
- app.py +47 -0
- requirements.txt +1 -0
app.py
CHANGED

@@ -11,6 +11,7 @@ import gradio as gr
 import torch
 from pathlib import Path
 import spaces
+import whisper
 
 # Add current directory to Python path for local zipvoice package
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -29,6 +30,7 @@ _models_cache = {}
 _tokenizer_cache = None
 _vocoder_cache = None
 _feature_extractor_cache = None
+_whisper_model_cache = None
 
 
 def load_models_and_components(model_name: str):
@@ -102,6 +104,36 @@ def load_models_and_components(model_name: str):
                               model_config["feature"]["sampling_rate"])
 
 
+def load_whisper_model():
+    """Load and cache Whisper model for transcription."""
+    global _whisper_model_cache
+
+    if _whisper_model_cache is None:
+        print("Loading Whisper model for transcription...")
+        # Use base model for faster transcription
+        _whisper_model_cache = whisper.load_model("base")
+
+    return _whisper_model_cache
+
+
+def transcribe_audio_whisper(audio_file):
+    """Transcribe audio file using Whisper."""
+    if audio_file is None:
+        return "Error: Please upload an audio file first."
+
+    try:
+        # Load Whisper model
+        model = load_whisper_model()
+
+        # Transcribe the audio
+        result = model.transcribe(audio_file, language="en")
+
+        return result["text"].strip()
+
+    except Exception as e:
+        return f"Error during transcription: {str(e)}"
+
+
 @spaces.GPU
 def synthesize_speech_gradio(
     text: str,
@@ -212,6 +244,9 @@ def create_gradio_interface():
         gr.HTML("""
             <div class="title">🎵 ZipVoice</div>
             <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
+            <div style="text-align: center; color: #64748b; font-size: 0.9em; margin-bottom: 1em;">
+                Upload audio, click "Transcribe Audio" to get automatic transcription, then generate speech in that voice!
+            </div>
         """)
 
         with gr.Row():
@@ -250,6 +285,12 @@ def create_gradio_interface():
                     lines=2
                 )
 
+                transcribe_btn = gr.Button(
+                    "🎤 Transcribe Audio",
+                    variant="secondary",
+                    size="sm"
+                )
+
                 generate_btn = gr.Button(
                     "🎵 Generate Speech",
                     variant="primary",
@@ -279,6 +320,12 @@ def create_gradio_interface():
         )
 
         # Event handling
+        transcribe_btn.click(
+            fn=transcribe_audio_whisper,
+            inputs=[prompt_audio],
+            outputs=[prompt_text]
+        )
+
         generate_btn.click(
             fn=synthesize_speech_gradio,
             inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
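For reference, the transcription path this commit adds can be exercised outside Gradio. A minimal sketch, assuming openai-whisper is installed; "sample.wav" is a hypothetical local file, not one from this repo:

import whisper

# Mirrors the Space's flow: load the "base" model once, then transcribe.
# "sample.wav" is a placeholder path used only for illustration.
model = whisper.load_model("base")
result = model.transcribe("sample.wav", language="en")
print(result["text"].strip())

The dict returned by transcribe() also carries per-segment timestamps under result["segments"], which the Space does not use here; it only feeds the stripped text into the prompt_text textbox.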
requirements.txt
CHANGED

@@ -11,6 +11,7 @@ vocos
 pydub
 gradio==5.47.0
 spaces
+openai-whisper
 
 # Normalization
 cn2an
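The new dependency is the openai-whisper package, imported as whisper in app.py. A quick sanity check after pip install openai-whisper, confirming the import resolves and listing the available checkpoint sizes (the commit picks "base" as a speed/accuracy compromise):

import whisper

# available_models() lists checkpoint names without downloading anything.
print(whisper.available_models())

Note that whisper decodes audio by shelling out to ffmpeg, so the ffmpeg binary must also be present in the Space's runtime for transcription to work.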