Hexa06 committed on
Commit
8c9cdca
·
1 Parent(s): 343086f

Deploy WhisperX service

Browse files
Files changed (3) hide show
  1. app.py +110 -0
  2. packages.txt +1 -0
  3. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
import gradio as gr
import whisperx
import torch
import tempfile
import os
import uvicorn
from threading import Thread  # NOTE(review): Thread appears unused in this file — confirm before removing

# Device setup: prefer GPU when available; int8 keeps CPU inference usable.
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"

print(f"🚀 Device: {device}, Compute: {compute_type}")

# Create FastAPI app (Gradio UI is mounted onto it further down).
app = FastAPI(title="WhisperX Alignment API")
19
+
20
# Lazily-loaded model caches shared across requests.  The original code
# reloaded both the ASR model and the alignment model on EVERY call, which
# costs several seconds per request; loading once and reusing is safe because
# inference does not mutate the models.
_asr_model = None          # WhisperX ASR model, loaded on first use
_align_models: dict = {}   # language code -> (align_model, metadata)


def _get_asr_model():
    """Return the shared WhisperX ASR model, loading it on first call."""
    global _asr_model
    if _asr_model is None:
        _asr_model = whisperx.load_model("base", device=device, compute_type=compute_type)
    return _asr_model


def _get_align_model(language: str):
    """Return (align_model, metadata) for *language*, loading once per language."""
    if language not in _align_models:
        _align_models[language] = whisperx.load_align_model(language_code=language, device=device)
    return _align_models[language]


def process_audio(audio_path: str, language: str = "en"):
    """Transcribe *audio_path* and return word-level timestamps.

    Returns a dict with ``word_segments`` (list of ``{word, start, end}``),
    ``duration`` (seconds), ``word_count``, ``language`` and ``device``.
    On failure returns ``{"error": <message>}`` instead of raising, so both
    the REST endpoint and the Gradio UI can surface the message directly.
    """
    try:
        print(f"Processing {audio_path} ({language})...")

        # Transcribe (model cached across calls — see _get_asr_model)
        result = _get_asr_model().transcribe(audio_path, language=language)

        # Word-level forced alignment
        align_model, metadata = _get_align_model(language)
        aligned = whisperx.align(result["segments"], align_model, metadata, audio_path, device=device)

        # Flatten aligned segments into one list of word timings.  WhisperX
        # omits start/end for tokens it could not align (e.g. numerals), so
        # skip those instead of raising KeyError.
        word_segments = []
        for segment in aligned["segments"]:
            for word in segment.get("words", []):
                if "start" not in word or "end" not in word:
                    continue
                word_segments.append({
                    "word": word["word"].strip(),
                    "start": round(word["start"], 2),
                    "end": round(word["end"], 2),
                })

        # Duration taken from the last segment's end time; 0 for empty audio.
        duration = aligned["segments"][-1]["end"] if aligned["segments"] else 0

        return {
            "word_segments": word_segments,
            "duration": round(duration, 2),
            "word_count": len(word_segments),
            "language": language,
            "device": device,
        }
    except Exception as e:  # surface failures as JSON rather than a 500
        print(f"❌ Error: {e}")
        return {"error": str(e)}
57
+
58
# FastAPI endpoint
@app.post("/align")
async def align_audio_api(
    audio_file: UploadFile = File(...),
    language: str = Form("en")
):
    """REST API endpoint for audio alignment.

    Accepts a multipart audio upload plus an optional ``language`` form
    field (default ``"en"``), writes the upload to a temp file, runs
    ``process_audio`` on it, and returns the result as JSON.  Errors from
    processing come back as ``{"error": ...}`` in the JSON body.
    """
    temp_path = None
    try:
        # Preserve the upload's own extension so downstream decoding can
        # sniff the container format; the original hardcoded ".mp3", which
        # mislabels WAV/OGG/etc. uploads.  Fall back to ".mp3" when the
        # client sent no filename.
        suffix = os.path.splitext(audio_file.filename or "")[1] or ".mp3"

        # Save temp file
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            content = await audio_file.read()
            tmp.write(content)
            temp_path = tmp.name

        # Process
        result = process_audio(temp_path, language)
        return JSONResponse(result)

    finally:
        # Always remove the temp copy, even when processing raised.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
80
+
81
@app.get("/")
def health():
    """Liveness probe: report service status and the compute device in use."""
    payload = {"status": "healthy", "device": device}
    return payload
84
+
85
# Gradio interface
def align_gradio(audio_file, language="en"):
    """Gradio UI wrapper: delegate to process_audio, guarding empty input."""
    if audio_file:
        return process_audio(audio_file, language)
    return {"error": "No file"}
91
+
92
# Build the Gradio UI: single audio upload + language code, JSON output.
gradio_app = gr.Interface(
    fn=align_gradio,
    inputs=[
        gr.Audio(type="filepath", label="Audio"),
        gr.Textbox(value="en", label="Language")
    ],
    outputs=gr.JSON(label="Result"),
    title="🎯 WhisperX Alignment",
    description="Upload audio for word-level timestamps"
)

# Mount Gradio to FastAPI at the root path; REST routes (/align, /) were
# registered on `app` before mounting, so they remain reachable.
app = gr.mount_gradio_app(app, gradio_app, path="/")

# Launch directly with uvicorn when run as a script (port 7860 is the
# conventional Hugging Face Spaces port).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/m-bain/whisperx.git
2
+ fastapi
3
+ uvicorn[standard]
4
+ python-multipart
5
+
6
+
7
+
8
+
9
+