Add Whisper transcription feature for automatic audio-to-text
- app.py +47 -0
- requirements.txt +1 -0
app.py
CHANGED

@@ -11,6 +11,7 @@ import gradio as gr
 import torch
 from pathlib import Path
 import spaces
+import whisper
 
 # Add current directory to Python path for local zipvoice package
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -29,6 +30,7 @@ _models_cache = {}
 _tokenizer_cache = None
 _vocoder_cache = None
 _feature_extractor_cache = None
+_whisper_model_cache = None
 
 
 def load_models_and_components(model_name: str):
@@ -102,6 +104,36 @@ def load_models_and_components(model_name: str):
                               model_config["feature"]["sampling_rate"])
 
 
+def load_whisper_model():
+    """Load and cache Whisper model for transcription."""
+    global _whisper_model_cache
+
+    if _whisper_model_cache is None:
+        print("Loading Whisper model for transcription...")
+        # Use base model for faster transcription
+        _whisper_model_cache = whisper.load_model("base")
+
+    return _whisper_model_cache
+
+
+def transcribe_audio_whisper(audio_file):
+    """Transcribe audio file using Whisper."""
+    if audio_file is None:
+        return "Error: Please upload an audio file first."
+
+    try:
+        # Load Whisper model
+        model = load_whisper_model()
+
+        # Transcribe the audio
+        result = model.transcribe(audio_file, language="en")
+
+        return result["text"].strip()
+
+    except Exception as e:
+        return f"Error during transcription: {str(e)}"
+
+
 @spaces.GPU
 def synthesize_speech_gradio(
     text: str,
@@ -212,6 +244,9 @@ def create_gradio_interface():
         gr.HTML("""
             <div class="title">🎵 ZipVoice</div>
             <div class="subtitle">Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching</div>
+            <div style="text-align: center; color: #64748b; font-size: 0.9em; margin-bottom: 1em;">
+                Upload audio, click "Transcribe Audio" to get automatic transcription, then generate speech in that voice!
+            </div>
         """)
 
         with gr.Row():
@@ -250,6 +285,12 @@ def create_gradio_interface():
                     lines=2
                 )
 
+                transcribe_btn = gr.Button(
+                    "🎤 Transcribe Audio",
+                    variant="secondary",
+                    size="sm"
+                )
+
                 generate_btn = gr.Button(
                     "🎵 Generate Speech",
                     variant="primary",
@@ -279,6 +320,12 @@ def create_gradio_interface():
         )
 
         # Event handling
+        transcribe_btn.click(
+            fn=transcribe_audio_whisper,
+            inputs=[prompt_audio],
+            outputs=[prompt_text]
+        )
+
         generate_btn.click(
             fn=synthesize_speech_gradio,
             inputs=[text_input, prompt_audio, prompt_text, model_dropdown, speed_slider],
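For reference, the transcription path this commit adds can be exercised outside Gradio. A minimal sketch, assuming openai-whisper is installed; "sample.wav" is a hypothetical local file, not one from this repo:

import whisper

# Mirrors the Space's flow: load the "base" model once, then transcribe.
# "sample.wav" is a placeholder path used only for illustration.
model = whisper.load_model("base")
result = model.transcribe("sample.wav", language="en")
print(result["text"].strip())

The dict returned by transcribe() also carries per-segment timestamps under result["segments"], which the Space does not use here; it only feeds the stripped text into the prompt_text textbox.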
requirements.txt
CHANGED

@@ -11,6 +11,7 @@ vocos
 pydub
 gradio==5.47.0
 spaces
+openai-whisper
 
 # Normalization
 cn2an
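The new dependency is the openai-whisper package, imported as whisper in app.py. A quick sanity check after pip install openai-whisper, confirming the import resolves and listing the available checkpoint sizes (the commit picks "base" as a speed/accuracy compromise):

import whisper

# available_models() lists checkpoint names without downloading anything.
print(whisper.available_models())

Note that whisper decodes audio by shelling out to ffmpeg, so the ffmpeg binary must also be present in the Space's runtime for transcription to work.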