Files changed:
- app.py (+24 -59)
- requirements.txt (+0 -2)
app.py
CHANGED
@@ -1,12 +1,16 @@
-import torch
 import gradio as gr
 import subprocess
 import datetime
 import tempfile
-
+import requests
 from loguru import logger
+from os import getenv
 
-
+API_URL = getenv("API_URL")
+headers = {
+    "Accept": "application/json",
+    "Content-Type": "audio/flac"
+}
 
 def format_time(seconds):
     """Convert seconds to SRT time format (HH:MM:SS,mmm).
@@ -66,7 +70,7 @@ def generate_srt(chunks):
     for i, chunk in enumerate(chunks, 1):
         start_time = format_time(chunk["timestamp"][0])
         end_time = format_time(chunk["timestamp"][1])
-        text = chunk
+        text = chunk.get("text", "").strip()
         srt_content.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
     return "".join(srt_content)
 
@@ -106,35 +110,13 @@ def check_ffmpeg():
 # Initialize ffmpeg check
 check_ffmpeg()
 
-
-
-logger.info(f"Using device: {device}")
-
-def create_pipeline():
-    """Create a new pipeline with optimized settings for T4 GPU.
-
-    Returns:
-        transformers.Pipeline: Configured speech recognition pipeline.
-    """
-    return pipeline(
-        task="automatic-speech-recognition",
-        model=MODEL_NAME,
-        device=device,
-    )
-
-# Initialize pipeline once
-pipe = create_pipeline()
-logger.info(f"Pipeline initialized: {pipe}")
-
-def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_length_s):
-    """Transcribe audio input using Whisper model.
+def transcribe(inputs, return_timestamps, generate_subs):
+    """Transcribe audio input using Whisper model via Hugging Face Inference API.
 
     Args:
         inputs (str): Path to audio file to transcribe.
         return_timestamps (bool): Whether to include timestamps in output.
         generate_subs (bool): Whether to generate SRT subtitles.
-        batch_size (int): Number of chunks to process in parallel.
-        chunk_length_s (int): Length of audio chunks in seconds.
 
     Returns:
         tuple: (formatted_result, srt_file, correction_text)
@@ -152,27 +134,20 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
     try:
         logger.info(f"Processing audio file: {inputs}")
 
-        #
-
+        # Read the audio file
+        with open(inputs, "rb") as f:
+            data = f.read()
 
-        #
-
-
-
+        # Send request to API
+        response = requests.post(API_URL, headers=headers, data=data)
+        response.raise_for_status()  # Raise an exception for bad status codes
+        result = response.json()
 
-
-        result = pipe(
-            inputs,
-            batch_size=batch_size,
-            chunk_length_s=chunk_length_s,
-            stride_length_s=stride_length_s,
-            return_timestamps="word" if return_timestamps else False
-        )
-        logger.debug(f"Pipeline result: {result}")
+        logger.debug(f"API response: {result}")
 
         # Format response as JSON
         formatted_result = {
-            "text": result
+            "text": result.get("text", "")
         }
 
         chunks = []
@@ -208,17 +183,11 @@ def transcribe(inputs, return_timestamps, generate_subs, batch_size, chunk_lengt
             srt_file = save_srt_to_file(srt_content)
             logger.info("SRT subtitles generated successfully")
 
-        # Clear CUDA cache after processing
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache after processing")
-
         return formatted_result, srt_file, ""  # Return empty string for correction textbox
+    except requests.exceptions.RequestException as e:
+        logger.exception(f"API request failed: {str(e)}")
+        raise gr.Error(f"Failed to transcribe audio: API request failed - {str(e)}")
     except Exception as e:
-        # Ensure CUDA cache is cleared even if there's an error
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            logger.debug("Cleared CUDA cache after error")
         logger.exception(f"Error during transcription: {str(e)}")
         raise gr.Error(f"Failed to transcribe audio: {str(e)}")
 
@@ -232,8 +201,6 @@ mf_transcribe = gr.Interface(
         gr.Audio(sources="microphone", type="filepath"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
-        gr.Slider(minimum=1, maximum=64, value=8, step=1, label="Batch Size"),
-        gr.Slider(minimum=5, maximum=30, value=30, step=5, label="Chunk Length (seconds)"),
     ],
     outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -242,7 +209,7 @@ mf_transcribe = gr.Interface(
     title="Whisper Large V3 Turbo: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{
+        f" checkpoint [{API_URL}](https://huggingface.co/{API_URL}) and 🤗 Transformers to transcribe audio files"
         " of arbitrary length."
     )
 )
@@ -253,8 +220,6 @@ file_transcribe = gr.Interface(
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
         gr.Checkbox(label="Include timestamps", value=True),
         gr.Checkbox(label="Generate subtitles", value=True),
-        gr.Slider(minimum=1, maximum=32, value=8, step=1, label="Batch Size"),
-        gr.Slider(minimum=5, maximum=30, value=15, step=5, label="Chunk Length (seconds)"),
     ],
    outputs=[
         gr.JSON(label="Transcription", open=True),
@@ -263,7 +228,7 @@ file_transcribe = gr.Interface(
     title="Whisper Large V3: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{
+        f" checkpoint [{API_URL}](https://huggingface.co/{API_URL}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
     )
 )
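For reference, the refactored transcribe path is now a single HTTP round trip against the endpoint configured in API_URL. The sketch below exercises that path outside Gradio; the sample file name and the optional Authorization header are assumptions (the diff only sets Accept and Content-Type), and only the "text" field of the response is relied upon by the code above.

# Minimal sketch of the new inference path (not part of the Space itself).
# Assumes API_URL points at a Whisper endpoint that accepts raw FLAC bytes,
# e.g. a Hugging Face Inference API / Inference Endpoint URL.
from os import getenv

import requests

API_URL = getenv("API_URL")
headers = {
    "Accept": "application/json",
    "Content-Type": "audio/flac",
    # Depending on the endpoint, a token may also be required (assumption):
    # "Authorization": f"Bearer {getenv('HF_TOKEN')}",
}

with open("sample.flac", "rb") as f:  # hypothetical input file
    data = f.read()

response = requests.post(API_URL, headers=headers, data=data)
response.raise_for_status()
result = response.json()
print(result.get("text", ""))  # the app only relies on the "text" field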
requirements.txt
CHANGED
@@ -1,4 +1,2 @@
-transformers
 loguru
-torch
 gradio
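With torch and transformers dropped, the Space's declared dependencies shrink to loguru and gradio. Note that the new app.py also imports requests, which is assumed to be available in the runtime image or pulled in transitively; a quick check like the sketch below can confirm the trimmed environment still satisfies app.py's imports.

# Sanity-check sketch (not part of the Space): verify the packages app.py
# imports are importable after the dependency cut. `requests` is assumed to be
# present even though it is not listed in requirements.txt.
import importlib

for name in ("gradio", "loguru", "requests"):
    try:
        importlib.import_module(name)
        print(f"{name}: OK")
    except ImportError:
        print(f"{name}: missing - consider adding it to requirements.txt")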