Luigi committed on
Commit
1b73690
Β·
1 Parent(s): 5df9ee2

Add Whisper transcription feature for automatic audio-to-text

Browse files
Files changed (2) hide show
  1. app.py +47 -0
  2. requirements.txt +1 -0
app.py CHANGED
@@ -11,6 +11,7 @@ import gradio as gr
11
  import torch
12
  from pathlib import Path
13
  import spaces
 
14
 
15
  # Add current directory to Python path for local zipvoice package
16
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -29,6 +30,7 @@ _models_cache = {}
29
  _tokenizer_cache = None
30
  _vocoder_cache = None
31
  _feature_extractor_cache = None
 
32
 
33
 
34
  def load_models_and_components(model_name: str):
@@ -102,6 +104,36 @@ def load_models_and_components(model_name: str):
102
  model_config["feature"]["sampling_rate"])
103
 
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  @spaces.GPU
106
  def synthesize_speech_gradio(
107
  text: str,
@@ -212,6 +244,9 @@ def create_gradio_interface():
212
  gr.HTML("""
213
  <div class="title">🎡 ZipVoice</div>
214
  <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
 
 
 
215
  """)
216
 
217
  with gr.Row():
@@ -250,6 +285,12 @@ def create_gradio_interface():
250
  lines=2
251
  )
252
 
 
 
 
 
 
 
253
  generate_btn = gr.Button(
254
  "🎡 Generate Speech",
255
  variant="primary",
@@ -279,6 +320,12 @@ def create_gradio_interface():
279
  )
280
 
281
  # Event handling
 
 
 
 
 
 
282
  generate_btn.click(
283
  fn=synthesize_speech_gradio,
284
  inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
 
11
  import torch
12
  from pathlib import Path
13
  import spaces
14
+ import whisper
15
 
16
  # Add current directory to Python path for local zipvoice package
17
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
30
  _tokenizer_cache = None
31
  _vocoder_cache = None
32
  _feature_extractor_cache = None
33
+ _whisper_model_cache = None
34
 
35
 
36
  def load_models_and_components(model_name: str):
 
104
  model_config["feature"]["sampling_rate"])
105
 
106
 
107
+ def load_whisper_model():
108
+ """Load and cache Whisper model for transcription."""
109
+ global _whisper_model_cache
110
+
111
+ if _whisper_model_cache is None:
112
+ print("Loading Whisper model for transcription...")
113
+ # Use base model for faster transcription
114
+ _whisper_model_cache = whisper.load_model("base")
115
+
116
+ return _whisper_model_cache
117
+
118
+
119
+ def transcribe_audio_whisper(audio_file):
120
+ """Transcribe audio file using Whisper."""
121
+ if audio_file is None:
122
+ return "Error: Please upload an audio file first."
123
+
124
+ try:
125
+ # Load Whisper model
126
+ model = load_whisper_model()
127
+
128
+ # Transcribe the audio
129
+ result = model.transcribe(audio_file, language="en")
130
+
131
+ return result["text"].strip()
132
+
133
+ except Exception as e:
134
+ return f"Error during transcription: {str(e)}"
135
+
136
+
137
  @spaces.GPU
138
  def synthesize_speech_gradio(
139
  text: str,
 
244
  gr.HTML("""
245
  <div class="title">🎡 ZipVoice</div>
246
  <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
247
+ <div style="text-align: center; color: #64748b; font-size: 0.9em; margin-bottom: 1em;">
248
+ Upload audio, click "Transcribe Audio" to get automatic transcription, then generate speech in that voice!
249
+ </div>
250
  """)
251
 
252
  with gr.Row():
 
285
  lines=2
286
  )
287
 
288
+ transcribe_btn = gr.Button(
289
+ "🎀 Transcribe Audio",
290
+ variant="secondary",
291
+ size="sm"
292
+ )
293
+
294
  generate_btn = gr.Button(
295
  "🎡 Generate Speech",
296
  variant="primary",
 
320
  )
321
 
322
  # Event handling
323
+ transcribe_btn.click(
324
+ fn=transcribe_audio_whisper,
325
+ inputs=[prompt_audio],
326
+ outputs=[prompt_text]
327
+ )
328
+
329
  generate_btn.click(
330
  fn=synthesize_speech_gradio,
331
  inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
requirements.txt CHANGED
@@ -11,6 +11,7 @@ vocos
11
  pydub
12
  gradio==5.47.0
13
  spaces
 
14
 
15
  # Normalization
16
  cn2an
 
11
  pydub
12
  gradio==5.47.0
13
  spaces
14
+ openai-whisper
15
 
16
  # Normalization
17
  cn2an