|
|
from fastapi import FastAPI, File, UploadFile, Form |
|
|
from fastapi.responses import JSONResponse |
|
|
import gradio as gr |
|
|
import whisperx |
|
|
import torch |
|
|
import tempfile |
|
|
import os |
|
|
import uvicorn |
|
|
from threading import Thread |
|
|
|
|
|
|
|
|
# Select the compute device: prefer a CUDA GPU when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


# float16 is fast on GPU; int8 keeps CPU inference memory-friendly.
compute_type = "float16" if device == "cuda" else "int8"




# NOTE(review): the leading "π" looks like a mojibake'd emoji — confirm the
# file's encoding; the literal is left untouched here.
print(f"π Device: {device}, Compute: {compute_type}")




# FastAPI application; the Gradio UI is mounted onto it further below.
app = FastAPI(title="WhisperX Alignment API")
|
|
|
|
|
def process_audio(audio_path: str, language: str = "en"):
    """Transcribe an audio file with WhisperX and return word-level timestamps.

    Args:
        audio_path: Path to an audio file readable by WhisperX/ffmpeg.
        language: Language code passed to both the ASR and alignment models.

    Returns:
        On success, a dict with keys ``word_segments`` (list of
        ``{"word", "start", "end"}``), ``duration``, ``word_count``,
        ``language`` and ``device``. On any failure, ``{"error": <message>}``
        — callers rely on this function never raising.
    """
    try:
        print(f"π Processing {audio_path} ({language})...")

        # NOTE(review): both models are reloaded on every call; consider
        # caching them at module level if throughput matters.
        model = whisperx.load_model("base", device=device, compute_type=compute_type)

        result = model.transcribe(audio_path, language=language)

        # Forced alignment adds per-word timestamps to each segment.
        align_model, metadata = whisperx.load_align_model(language_code=language, device=device)
        aligned = whisperx.align(result["segments"], align_model, metadata, audio_path, device=device)

        # Flatten segments into one word list. Words the aligner could not
        # time lack "start"/"end"; skip them instead of letting a KeyError
        # fail the entire request (the previous behavior).
        word_segments = []
        for segment in aligned["segments"]:
            for word in segment.get("words", []):
                if "start" not in word or "end" not in word:
                    continue
                word_segments.append({
                    "word": word["word"].strip(),
                    "start": round(word["start"], 2),
                    "end": round(word["end"], 2)
                })

        # Approximate total duration as the end of the last aligned segment.
        duration = aligned["segments"][-1]["end"] if aligned["segments"] else 0

        return {
            "word_segments": word_segments,
            "duration": round(duration, 2),
            "word_count": len(word_segments),
            "language": language,
            "device": device
        }
    except Exception as e:
        # Deliberate catch-all: API/UI callers expect an error dict, never an
        # exception.
        print(f"β Error: {e}")
        return {"error": str(e)}
|
|
|
|
|
|
|
|
@app.post("/align")
async def align_audio_api(
    audio_file: UploadFile = File(...),
    language: str = Form("en")
):
    """REST endpoint: persist the uploaded audio to a temp file, run
    `process_audio` on it, and return the result as JSON.

    The temp file is always removed, even when processing fails.
    """
    temp_path = None
    try:
        # Keep the uploaded file's extension so downstream decoding gets the
        # correct format hint; fall back to ".mp3" (the previous hard-coded
        # behavior) when the client supplied no usable filename.
        suffix = os.path.splitext(audio_file.filename or "")[1] or ".mp3"
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            content = await audio_file.read()
            tmp.write(content)
            temp_path = tmp.name

        result = process_audio(temp_path, language)
        return JSONResponse(result)

    finally:
        # Best-effort cleanup of the temp copy.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
|
|
|
|
|
@app.get("/")
def health():
    """Liveness probe: report service status and the active compute device."""
    payload = {"status": "healthy"}
    payload["device"] = device
    return payload
|
|
|
|
|
|
|
|
def align_gradio(audio_file, language="en"):
    """Adapter between the Gradio form inputs and the core alignment routine.

    Returns an error dict when no file was provided, otherwise delegates to
    `process_audio`.
    """
    if audio_file:
        return process_audio(audio_file, language)
    return {"error": "No file"}
|
|
|
|
|
# Build the browser UI: an audio upload plus a language box feeding
# align_gradio, with the result shown as JSON.
_ui_inputs = [
    gr.Audio(type="filepath", label="Audio"),
    gr.Textbox(value="en", label="Language"),
]
_ui_output = gr.JSON(label="Result")
gradio_app = gr.Interface(
    fn=align_gradio,
    inputs=_ui_inputs,
    outputs=_ui_output,
    title="π― WhisperX Alignment",
    description="Upload audio for word-level timestamps",
)

# Serve the Gradio UI at the root path of the FastAPI app.
app = gr.mount_gradio_app(app, gradio_app, path="/")
|
|
|
|
|
|
|
|
if __name__ == "__main__":

    # Serve the combined FastAPI + Gradio app on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
|
|
|
|
|