|
|
from fastapi import FastAPI, HTTPException, Form, BackgroundTasks |
|
|
from fastapi.responses import FileResponse |
|
|
from kokoro_onnx import Kokoro |
|
|
import tempfile |
|
|
import os |
|
|
from datetime import datetime |
|
|
import soundfile as sf |
|
|
|
|
|
|
|
|
# Upper bound on the input length accepted by generate_speech(); its error
# message equates this with roughly five minutes of audio.
MAX_CHARS = 4_500

# Lower bound on the input length accepted by generate_speech().
MIN_CHARS = 5

# Not referenced anywhere in the visible code. Presumably seconds
# (5 * 60 matches the "~5 min audio" wording) -- TODO confirm.
MAX_AUDIO_DURATION = 5 * 60
|
|
|
|
|
|
|
|
# Try to load the model eagerly at import time. If that fails for any reason,
# `kokoro` is left as None and the startup hook downloads the model files and
# initialises the model instead.
print("🎤 Loading Kokoro TTS model...")
try:
    # Use the voices *file* rather than the "voices" directory: this is the
    # same path the startup hook downloads to and loads from, so a previously
    # downloaded model is actually picked up here.
    kokoro = Kokoro("kokoro-v0_19.onnx", "voices/voices.bin")
    print("✅ Kokoro TTS loaded successfully!")
except Exception as exc:  # deliberate best-effort: startup retries the load
    print("⚠️ Kokoro not found locally. Will download on first use.")
    # Surface the real cause; it is not always a missing file.
    print(f"   Reason: {type(exc).__name__}: {exc}")
    kokoro = None
|
|
|
|
|
# Service metadata handed to FastAPI when the application object is created.
_APP_METADATA = {
    "title": "Kokoro TTS API - Fast & Simple",
    "description": "High-speed text-to-speech with emotional voices",
    "version": "2.0",
}

# The ASGI application; the route decorators below register against it.
app = FastAPI(**_APP_METADATA)
|
|
|
|
|
def _download_if_missing(url: str, destination: str, label: str, done_message: str) -> None:
    """Fetch *url* into *destination* unless that file already exists.

    The payload is written to a sibling ``.part`` file and renamed onto
    *destination* only after the transfer has finished. An interrupted
    download therefore never leaves a truncated file behind that a later
    existence check would mistake for a complete one.

    Args:
        url: Remote location of the file.
        destination: Local path the file should end up at.
        label: Short name printed in the progress message.
        done_message: Message printed once the file is in place.

    Raises:
        OSError: If the transfer or the final rename fails (``urllib`` errors
            are ``OSError`` subclasses).
    """
    if os.path.exists(destination):
        return

    import urllib.request

    print(f"Downloading {label}...")
    partial_path = f"{destination}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, destination)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        if os.path.exists(partial_path):
            os.unlink(partial_path)
    print(done_message)


# NOTE: recent FastAPI releases deprecate on_event in favour of lifespan
# handlers; kept here because the application object is created elsewhere.
@app.on_event("startup")
def startup():
    """Make sure the global Kokoro model is loaded before requests are served.

    Does nothing when the import-time load already succeeded. Otherwise it
    downloads whichever of the voices and model files are missing and then
    initialises the global ``kokoro`` instance from them.
    """
    global kokoro
    if kokoro is not None:
        return

    print("📥 Downloading Kokoro TTS model files...")

    os.makedirs("voices", exist_ok=True)

    voices_file = "voices/voices.bin"
    _download_if_missing(
        "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin",
        voices_file,
        "voices.bin",
        "✅ Voices downloaded!",
    )

    model_file = "kokoro-v0_19.onnx"
    _download_if_missing(
        "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.onnx",
        model_file,
        "kokoro-v0_19.onnx",
        "✅ Model downloaded!",
    )

    print("🎤 Initializing Kokoro TTS...")
    kokoro = Kokoro(model_file, voices_file)
    print("✅ Kokoro TTS loaded!")
|
|
|
|
|
|
|
|
def cleanup_file(path: str):
    """Delete a temporary file after the response has been sent.

    Cleanup is best-effort: a file that is already gone is ignored silently,
    and any other filesystem error is reported but never raised, so a failed
    cleanup cannot break the background task that runs it.

    Args:
        path: Filesystem path of the file to remove.
    """
    try:
        # Attempt the delete directly instead of checking for existence
        # first; the check-then-delete form races with concurrent removal.
        os.unlink(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        print(f"⚠️ Could not delete temporary file {path}: {exc}")
|
|
|
|
|
def generate_speech(text: str, voice: str = "bf_isabella", speed: float = 1.0) -> str:
    """Generate speech with Kokoro TTS and return the path of a WAV file.

    Available voices: af_heart, af_bella, am_adam, am_michael, bf_emma, bf_isabella

    Args:
        text: Text to synthesise; its length must lie between ``MIN_CHARS``
            and ``MAX_CHARS`` inclusive.
        voice: Voice identifier passed through to Kokoro.
        speed: Speed value passed through to Kokoro.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        ValueError: If *text* is shorter than ``MIN_CHARS`` or longer than
            ``MAX_CHARS``.
        RuntimeError: If the Kokoro model has not been loaded.
    """
    if len(text) < MIN_CHARS:
        raise ValueError(f"Text too short. Minimum {MIN_CHARS} characters.")
    if len(text) > MAX_CHARS:
        raise ValueError(f"Text too long. Maximum {MAX_CHARS} characters (~5 min audio).")
    if kokoro is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("Kokoro TTS model is not loaded.")

    samples, sample_rate = kokoro.create(
        text=text,
        voice=voice,
        speed=speed,
        lang="en-us",
    )

    # Reserve a unique path and close our handle before the audio is written,
    # so the file is never open twice (which fails on some platforms).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    try:
        sf.write(output_path, samples, sample_rate)
    except BaseException:
        # delete=False means nobody else removes the file; do not leak it
        # when the write fails.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise
    return output_path
|
|
|
|
|
|
|
|
@app.get("/")
def root():
    # Static service description: identity, headline features and the
    # available endpoints. Kept as comments rather than a docstring so the
    # published route description stays unchanged.
    feature_summary = {
        "speed": "10x faster than XTTS",
        "voices": 6,
        "max_chars": MAX_CHARS,
        "emotional": True,
    }
    endpoint_index = {
        "health": "/health",
        "generate": "/api/generate (POST)",
        "docs": "/docs",
    }
    service_info = {
        "service": "Kokoro TTS API",
        "status": "running",
        "model": "Kokoro-82M",
        "version": "2.0",
        "features": feature_summary,
        "endpoints": endpoint_index,
    }
    return service_info
|
|
|
|
|
@app.get("/health")
def health():
    # Static health payload: always reports "healthy" together with the model
    # name, the character limit and the advertised voice identifiers.
    advertised_voices = [
        "af_heart",
        "af_bella",
        "am_adam",
        "am_michael",
        "bf_emma",
        "bf_isabella",
    ]
    payload = {"status": "healthy", "model": "Kokoro TTS 82M"}
    payload["speed"] = "10x faster than XTTS"
    payload["max_chars"] = MAX_CHARS
    payload["voices"] = advertised_voices
    return payload
|
|
|
|
|
@app.post("/api/generate")
async def generate_tts(
    background_tasks: BackgroundTasks,
    text: str = Form(..., description="Text to convert to speech"),
    voice: str = Form("bf_isabella", description="Voice to use"),
    speed: float = Form(1.0, description="Speech speed (0.5-2.0)")
):
    """
    Generate TTS with Kokoro (Fast & Emotional)

    **Performance:**
    - Max audio: 5 minutes (4500 chars)
    - Generation: ~20-30 seconds on CPU
    - Speech rate: ~900 chars/minute

    **Available Voices:**
    - `af_heart`: American Female (warm)
    - `af_bella`: American Female (professional)
    - `am_adam`: American Male (confident)
    - `am_michael`: American Male (friendly)
    - `bf_emma`: British Female (elegant)
    - `bf_isabella`: British Female (storytelling) ⭐ Best for long content

    **Example:**
    ```bash
    curl -X POST https://your-space.hf.space/api/generate \\
      -F "text=Hello world, this is Kokoro TTS!" \\
      -F "voice=bf_isabella" \\
      -F "speed=1.0" \\
      --output audio.wav
    ```
    """
    # Responses:
    #   200 - WAV file, with X-Character-Count and X-Voice-Used headers.
    #   400 - speed outside 0.5-2.0, or text rejected by generate_speech().
    #   500 - any other failure while generating or packaging the audio.

    # Validate before entering the try block: raised inside it, this 400
    # would be caught by the generic handler below and reported as a 500.
    # The chained comparison also rejects NaN, which the separate "<" / ">"
    # tests let through.
    if not 0.5 <= speed <= 2.0:
        raise HTTPException(status_code=400, detail="Speed must be between 0.5 and 2.0")

    # Strip once so the length limits, the synthesised text and the reported
    # character count all refer to the same string.
    cleaned_text = text.strip()

    try:
        # NOTE(review): generate_speech is synchronous, so it runs on the
        # event loop for the whole synthesis; consider offloading it.
        output_path = generate_speech(cleaned_text, voice, speed)

        # Remove the temporary file once the response has been sent.
        background_tasks.add_task(cleanup_file, output_path)

        response = FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"kokoro_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
        )
        response.headers["X-Character-Count"] = str(len(cleaned_text))
        response.headers["X-Voice-Used"] = voice

        return response

    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(exc)}") from exc
|
|
|
|
|
if __name__ == "__main__":
    # Run directly as a script: serve the app with uvicorn on every
    # interface, port 7860.
    import uvicorn

    uvicorn.run(app, port=7860, host="0.0.0.0")
|
|
|