Spaces:

KiWA001
/

kai-api-gateway

Running

KiWA001 committed 3 days ago

Commit

cbd3c0a

1 Parent(s): 97ee402

Replace SpeechMA with Kokoro TTS - Fast & Natural

- Add Kokoro TTS provider (82M parameters, 24kHz quality)
- No browser automation, no CAPTCHA needed
- Default voice: Bella (Female, American) - Best quality
- Update requirements.txt with kokoro and soundfile
- Update TTS router endpoints for Kokoro
- Update dashboard with new voices (10 voices, 6 languages)
- API remains 11Labs-compatible

Voices available:
- American: Bella (♀), Sarah (♀), Michael (♂), Adam (♂)
- British: Emma (♀), George (♂)
- Spanish: Sofia (♀)
- French: Jean (♂)
- Japanese: Sakura (♀)
- Chinese: Li (♀)

Files changed (4) hide show

providers/kokoro_tts_provider.py +230 -0
requirements.txt +5 -0
static/docs.html +23 -28
tts_router.py +51 -69

providers/kokoro_tts_provider.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""
+Kokoro TTS Provider
+-------------------
+Fast, natural-sounding text-to-speech using Kokoro-82M model.
+No browser automation, no CAPTCHA, runs locally.
+Installation:
+    pip install kokoro soundfile
+Note: Requires espeak-ng for some languages:
+    - Ubuntu/Debian: apt-get install espeak-ng
+    - macOS: brew install espeak-ng
+"""
+import io
+import logging
+from typing import Optional
+import asyncio
+logger = logging.getLogger("kai_api.kokoro_tts")
+# Try to import Kokoro
+try:
+    from kokoro import KPipeline
+    import soundfile as sf
+    import torch
+    KOKORO_AVAILABLE = True
+except ImportError:
+    KOKORO_AVAILABLE = False
+    logger.warning("Kokoro not installed. Run: pip install kokoro soundfile")
+# Voice table: maps each public voice_id to its Kokoro voice code ("code"), the KPipeline lang_code ("lang"), and display metadata.
+# Code format used below: <language letter><gender letter>_<name>, e.g. 'af_heart' (American Female), 'am_michael' (American Male).
+KOKORO_VOICES = {
+    # American English - Female. NOTE(review): "bella" and "sarah" resolve to the same code (af_heart), so they sound identical - confirm intended.
+    "bella": {"code": "af_heart", "lang": "a", "gender": "Female", "accent": "American"},
+    "sarah": {"code": "af_heart", "lang": "a", "gender": "Female", "accent": "American"},
+    # American English - Male. NOTE(review): "michael" and "adam" resolve to the same code (am_michael) - confirm intended.
+    "michael": {"code": "am_michael", "lang": "a", "gender": "Male", "accent": "American"},
+    "adam": {"code": "am_michael", "lang": "a", "gender": "Male", "accent": "American"},
+    # British English - Female
+    "emma": {"code": "bf_emma", "lang": "b", "gender": "Female", "accent": "British"},
+    # British English - Male
+    "george": {"code": "bm_george", "lang": "b", "gender": "Male", "accent": "British"},
+    # Spanish. NOTE(review): verify that 'ef_sofia' exists in the installed Kokoro voice set - TODO confirm against the model's voice list.
+    "sofia": {"code": "ef_sofia", "lang": "e", "gender": "Female", "accent": "Spanish"},
+    # French. NOTE(review): verify that 'ff_jean' exists; the 'ff' prefix reads as French Female while gender here is Male - TODO confirm.
+    "jean": {"code": "ff_jean", "lang": "f", "gender": "Male", "accent": "French"},
+    # Japanese. NOTE(review): verify that 'jf_sakura' exists in the installed Kokoro voice set - TODO confirm.
+    "sakura": {"code": "jf_sakura", "lang": "j", "gender": "Female", "accent": "Japanese"},
+    # Chinese. NOTE(review): verify that 'zf_li' exists in the installed Kokoro voice set - TODO confirm.
+    "li": {"code": "zf_li", "lang": "z", "gender": "Female", "accent": "Chinese"},
+}
+# Default voice: the KOKORO_VOICES key used when a requested voice_id matches no entry
+DEFAULT_VOICE = "bella"
+# Cache pipelines per language to avoid reloading; keyed by lang_code and filled lazily on first use of each language
+_pipeline_cache = {}
+class KokoroTTSProvider:
+    """Kokoro Text-to-Speech Provider - Fast, natural, no browser needed."""
+    def __init__(self):
+        self.name = "kokoro"
+    @staticmethod
+    def is_available() -> bool:
+        """Check if Kokoro is installed and working."""
+        if not KOKORO_AVAILABLE:
+            return False
+        try:
+            # Try to initialize a pipeline
+            _ = KPipeline(lang_code='a')
+            return True
+        except Exception as e:
+            logger.error(f"Kokoro initialization failed: {e}")
+            return False
+    def get_available_voices(self) -> list[dict]:
+        """Return all available voices."""
+        voices = []
+        for voice_id, info in KOKORO_VOICES.items():
+            voices.append({
+                "voice_id": voice_id,
+                "name": voice_id.capitalize(),
+                "gender": info["gender"],
+                "language": info["lang"],
+                "accent": info["accent"],
+                "kokoro_code": info["code"]
+            })
+        return voices
+    def get_voice_info(self, voice_id: str) -> dict:
+        """Get voice information by voice_id."""
+        voice_id_lower = voice_id.lower()
+        # Try direct match
+        if voice_id_lower in KOKORO_VOICES:
+            info = KOKORO_VOICES[voice_id_lower]
+            return {
+                "voice_id": voice_id_lower,
+                "name": voice_id_lower.capitalize(),
+                **info
+            }
+        # Try to find by partial match
+        for vid, info in KOKORO_VOICES.items():
+            if voice_id_lower in vid:
+                return {
+                    "voice_id": vid,
+                    "name": vid.capitalize(),
+                    **info
+                }
+        # Return default
+        default_info = KOKORO_VOICES[DEFAULT_VOICE]
+        return {
+            "voice_id": DEFAULT_VOICE,
+            "name": DEFAULT_VOICE.capitalize(),
+            **default_info
+        }
+    async def generate_speech(
+        self,
+        text: str,
+        voice_id: str = "bella",
+        speed: float = 1.0,
+    ) -> Optional[bytes]:
+        """
+        Generate speech from text.
+        Args:
+            text: Text to convert (Kokoro works best with sentences)
+            voice_id: Voice to use
+            speed: Speech speed (0.5 to 2.0)
+        Returns:
+            MP3 audio data as bytes
+        """
+        if not KOKORO_AVAILABLE:
+            raise RuntimeError("Kokoro not installed. Run: pip install kokoro soundfile")
+        # Get voice info
+        voice_info = self.get_voice_info(voice_id)
+        kokoro_voice = voice_info["kokoro_code"]
+        lang_code = voice_info["language"]
+        logger.info(f"Kokoro TTS: voice={voice_info['voice_id']}, lang={lang_code}")
+        # Use thread pool for CPU-intensive TTS (Kokoro is CPU-based)
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(
+            None,
+            self._generate_sync,
+            text,
+            kokoro_voice,
+            lang_code,
+            speed
+        )
+    def _generate_sync(
+        self,
+        text: str,
+        voice: str,
+        lang_code: str,
+        speed: float
+    ) -> bytes:
+        """Synchronous generation (runs in thread pool)."""
+        try:
+            # Get or create pipeline for this language
+            if lang_code not in _pipeline_cache:
+                logger.info(f"Initializing Kokoro pipeline for language: {lang_code}")
+                _pipeline_cache[lang_code] = KPipeline(lang_code=lang_code)
+            pipeline = _pipeline_cache[lang_code]
+            # Generate audio
+            generator = pipeline(text, voice=voice, speed=speed)
+            # Collect all audio segments
+            audio_segments = []
+            for i, (gs, ps, audio) in enumerate(generator):
+                audio_segments.append(audio)
+                logger.debug(f"Generated segment {i}: {len(audio)} samples")
+            if not audio_segments:
+                raise ValueError("No audio generated")
+            # Concatenate all segments
+            import numpy as np
+            full_audio = np.concatenate(audio_segments)
+            # Convert to MP3 bytes
+            buffer = io.BytesIO()
+            sf.write(buffer, full_audio, 24000, format='MP3')
+            buffer.seek(0)
+            audio_bytes = buffer.getvalue()
+            logger.info(f"Kokoro: Generated {len(audio_bytes)} bytes of MP3 audio")
+            return audio_bytes
+        except Exception as e:
+            logger.error(f"Kokoro generation error: {e}")
+            raise
+    async def health_check(self) -> bool:
+        """Check if Kokoro is working."""
+        if not KOKORO_AVAILABLE:
+            return False
+        try:
+            # Quick test
+            test_pipeline = KPipeline(lang_code='a')
+            return True
+        except:
+            return False
+# Global provider instance
+_kokoro_provider = None
+def get_kokoro_provider() -> KokoroTTSProvider:
+    """Get or create the Kokoro provider singleton."""
+    global _kokoro_provider
+    if _kokoro_provider is None:
+        _kokoro_provider = KokoroTTSProvider()
+    return _kokoro_provider

requirements.txt CHANGED Viewed

@@ -6,6 +6,11 @@ httpx>=0.25.0
 pydantic>=2.0
 supabase>=2.0.0
 Pillow>=10.0.0
 # Search Engine
 requests>=2.31.0
 beautifulsoup4>=4.12.0

 pydantic>=2.0
 supabase>=2.0.0
 Pillow>=10.0.0
+# Text-to-Speech (Kokoro - Fast, Natural, No CAPTCHA)
+kokoro>=0.3.0
+soundfile>=0.12.0
 # Search Engine
 requests>=2.31.0
 beautifulsoup4>=4.12.0

static/docs.html CHANGED Viewed

@@ -621,7 +621,7 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/deep_research \
             <div class="endpoint-header">
                 <div class="endpoint-title">
                     <h3>Text-to-Speech</h3>
-                    <p>Convert text to natural-sounding speech using SpeechMA. Supports 20+ voices.</p>
                 </div>
                 <span class="method-badge">POST /v1/text-to-speech</span>
             </div>
@@ -635,7 +635,11 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/deep_research \
                         </tr>
                         <tr>
                             <td><span class="param-name">voice_id</span></td>
-                            <td><span class="param-desc">Voice to use (ava, andrew, brian, etc.)</span><span class="param-req">(optional)</span></td>
                         </tr>
                     </table>
                     <br>
@@ -644,41 +648,32 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/deep_research \
                         <button class="copy-btn" onclick="copyExample('ex-tts')">📋 Copy</button>
                     </div>
                     <div id="ex-tts" class="demo-response visible" style="color: #a78bfa;">
-curl -X POST https://kiwa001-kai-api-gateway.hf.space/v1/text-to-speech/ava \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer YOUR_API_KEY" \
-  -d '{"text": "Hello! This is a test of the text to speech API."}' \
   --output speech.mp3
                     </div>
                     <p style="color: var(--text-muted); font-size: 13px; margin-top: 10px;">
-                        <strong>Default Voice:</strong> Ava Multilingual (Female, US)<br>
-                        <strong>Available Voices:</strong> ava, andrew, brian, emma, remy, vivienne, daniel, serena, matthew, jane, and more.
                     </p>
                 </div>
                 <div class="endpoint-demo">
                     <span class="demo-label">Try It Live</span>
-                    <textarea id="tts-input" class="demo-input" rows="3" placeholder="Enter text to convert to speech...">Hello! Welcome to K-AI API text-to-speech. This is Ava speaking.</textarea>
                     <select id="tts-voice" class="demo-select">
-                        <option value="ava" selected>Ava (Female, US) - Default</option>
-                        <option value="andrew">Andrew (Male, US)</option>
-                        <option value="brian">Brian (Male, US)</option>
-                        <option value="emma">Emma (Female, UK)</option>
-                        <option value="remy">Remy (Male, France)</option>
-                        <option value="vivienne">Vivienne (Female, US)</option>
-                        <option value="daniel">Daniel (Male, UK)</option>
-                        <option value="serena">Serena (Female, US)</option>
-                        <option value="matthew">Matthew (Male, US)</option>
-                        <option value="jane">Jane (Female, US)</option>
-                        <option value="alfonso">Alfonso (Male, Spain)</option>
-                        <option value="mario">Mario (Male, Italy)</option>
-                        <option value="klaus">Klaus (Male, Germany)</option>
-                        <option value="sakura">Sakura (Female, Japan)</option>
-                        <option value="xin">Xin (Female, China)</option>
-                        <option value="jose">Jose (Male, Brazil)</option>
-                        <option value="ines">Ines (Female, Portugal)</option>
-                        <option value="amira">Amira (Female, Saudi Arabia)</option>
-                        <option value="fatima">Fatima (Female, UAE)</option>
                     </select>
                     <button class="demo-btn" id="tts-generate-btn" onclick="generateTTS()">Generate Speech ▶</button>
@@ -1037,7 +1032,7 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/v1/text-to-speech/ava \
             generateBtn.textContent = 'Generating... ⏳';
             statusBox.style.display = 'block';
-            statusBox.innerHTML = 'Sending request to SpeechMA... <span style="opacity: 0.7;">(Solving CAPTCHA)</span>';
             statusBox.style.color = 'var(--text-muted)';
             const startTime = Date.now();
@@ -1106,7 +1101,7 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/v1/text-to-speech/ava \
                     <div style="margin-bottom:5px; font-weight:bold;">Error Generating Speech</div>
                     <div style="font-size: 12px; opacity: 0.9;">${err.message}</div>
                     <div style="margin-top: 8px; font-size: 11px; opacity: 0.7;">
-                        Tip: This might be due to CAPTCHA issues. Try refreshing the page.
                     </div>
                 `;
             } finally {

             <div class="endpoint-header">
                 <div class="endpoint-title">
                     <h3>Text-to-Speech</h3>
+                    <p>Convert text to natural-sounding speech using Kokoro AI. Fast, high-quality, no CAPTCHA!</p>
                 </div>
                 <span class="method-badge">POST /v1/text-to-speech</span>
             </div>
                         </tr>
                         <tr>
                             <td><span class="param-name">voice_id</span></td>
+                            <td><span class="param-desc">Voice to use (bella, michael, emma, etc.)</span><span class="param-req">(optional)</span></td>
+                        </tr>
+                        <tr>
+                            <td><span class="param-name">speed</span></td>
+                            <td><span class="param-desc">Speech speed (0.5 - 2.0)</span><span class="param-req">(optional)</span></td>
                         </tr>
                     </table>
                     <br>
                         <button class="copy-btn" onclick="copyExample('ex-tts')">📋 Copy</button>
                     </div>
                     <div id="ex-tts" class="demo-response visible" style="color: #a78bfa;">
+curl -X POST https://kiwa001-kai-api-gateway.hf.space/v1/text-to-speech/bella \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer YOUR_API_KEY" \
+  -d '{"text": "Hello! This is Bella with a natural American accent."}' \
   --output speech.mp3
                     </div>
                     <p style="color: var(--text-muted); font-size: 13px; margin-top: 10px;">
+                        <strong>Default Voice:</strong> Bella (Female, American) - Natural & Expressive<br>
+                        <strong>Powered by:</strong> Kokoro-82M - Fast, natural TTS with 24kHz quality
                     </p>
                 </div>
                 <div class="endpoint-demo">
                     <span class="demo-label">Try It Live</span>
+                    <textarea id="tts-input" class="demo-input" rows="3" placeholder="Enter text to convert to speech...">Hello! I'm Bella, your AI assistant with a natural American accent. I can speak English, Spanish, French, Japanese, and more!</textarea>
                     <select id="tts-voice" class="demo-select">
+                        <option value="bella" selected>Bella (Female, American) - Best Quality</option>
+                        <option value="sarah">Sarah (Female, American)</option>
+                        <option value="michael">Michael (Male, American)</option>
+                        <option value="adam">Adam (Male, American)</option>
+                        <option value="emma">Emma (Female, British)</option>
+                        <option value="george">George (Male, British)</option>
+                        <option value="sofia">Sofia (Female, Spanish)</option>
+                        <option value="jean">Jean (Male, French)</option>
+                        <option value="sakura">Sakura (Female, Japanese)</option>
+                        <option value="li">Li (Female, Chinese)</option>
                     </select>
                     <button class="demo-btn" id="tts-generate-btn" onclick="generateTTS()">Generate Speech ▶</button>
             generateBtn.textContent = 'Generating... ⏳';
             statusBox.style.display = 'block';
+            statusBox.innerHTML = 'Generating with Kokoro AI... <span style="opacity: 0.7;">(Fast & Natural)</span>';
             statusBox.style.color = 'var(--text-muted)';
             const startTime = Date.now();
                     <div style="margin-bottom:5px; font-weight:bold;">Error Generating Speech</div>
                     <div style="font-size: 12px; opacity: 0.9;">${err.message}</div>
                     <div style="margin-top: 8px; font-size: 11px; opacity: 0.7;">
+                        Tip: Check that Kokoro is properly installed (pip install kokoro soundfile)
                     </div>
                 `;
             } finally {

tts_router.py CHANGED Viewed

@@ -2,7 +2,7 @@
 TTS Router - 11Labs Compatible API
 ----------------------------------
 Text-to-Speech endpoints compatible with ElevenLabs API structure.
-Uses SpeechMA as the backend provider.
 """
 from fastapi import APIRouter, Depends, HTTPException, Header, Request, Response
@@ -14,7 +14,7 @@ import uuid
 import json
 from auth import verify_api_key
-from providers.speechma_tts_provider import get_speechma_provider
 router = APIRouter()
@@ -32,16 +32,17 @@ class VoiceSettings(BaseModel):
 class TextToSpeechRequest(BaseModel):
     """11Labs-compatible TTS request."""
     text: str = Field(..., max_length=2000, description="Text to convert to speech")
-    model_id: Optional[str] = Field("eleven_multilingual_v2", description="Model ID (ignored, uses SpeechMA)")
     voice_settings: Optional[VoiceSettings] = Field(None, description="Voice settings")
     pronunciation_dictionary_locators: Optional[List[Dict[str, str]]] = None
     seed: Optional[int] = None
     previous_text: Optional[str] = None
     language_code: Optional[str] = None
-    # SpeechMA-specific fields
-    voice_id: Optional[str] = Field("ava", description="Voice ID to use")
-    output_format: Optional[str] = Field("mp3_44100_128", description="Output format")
     optimize_streaming_latency: Optional[int] = Field(0, ge=0, le=4)
@@ -117,19 +118,20 @@ class UserSubscriptionResponse(BaseModel):
 # --- Helper Functions ---
 def format_voice_to_11labs(voice_id: str, voice_info: dict) -> VoiceResponse:
-    """Convert SpeechMA voice to 11Labs format."""
     return VoiceResponse(
         voice_id=voice_id,
         name=voice_info["name"],
         category="premade",
         labels={
-            "accent": voice_info.get("country", "Multilingual"),
-            "description": f"{voice_info['gender']} {voice_info['language']} voice",
             "age": "adult",
             "gender": voice_info["gender"].lower(),
-            "use_case": "general"
         },
-        description=f"{voice_info['gender']} {voice_info['language']} voice from {voice_info.get('country', 'Unknown')}",
         settings=VoiceSettings()
     )
@@ -162,8 +164,8 @@ async def list_tts_models(
     models = [
         TTSModelInfo(
             model_id="eleven_multilingual_v2",
-            name="Eleven Multilingual v2",
-            description="Our most advanced multilingual model with highest quality",
             can_do_text_to_speech=True,
             can_do_voice_conversion=False,
             can_use_style=True,
@@ -183,14 +185,13 @@ async def list_tts_models(
                 {"language_id": "pt", "name": "Portuguese"},
                 {"language_id": "ja", "name": "Japanese"},
                 {"language_id": "zh", "name": "Chinese"},
-                {"language_id": "ar", "name": "Arabic"},
                 {"language_id": "hi", "name": "Hindi"},
             ]
         ),
         TTSModelInfo(
             model_id="eleven_flash_v2_5",
-            name="Eleven Flash v2.5",
-            description="Ultra-low latency model (~75ms)",
             can_do_text_to_speech=True,
             can_do_voice_conversion=False,
             can_use_style=False,
@@ -219,19 +220,13 @@ async def list_voices(
     """
     List all available voices.
     """
-    provider = get_speechma_provider()
     voices_data = provider.get_available_voices()
     voices = []
     for voice_data in voices_data:
         voice_id = voice_data["voice_id"]
-        info = {
-            "name": voice_data["name"],
-            "gender": voice_data["gender"],
-            "language": voice_data["language"],
-            "country": voice_data.get("country", "Unknown")
-        }
-        voices.append(format_voice_to_11labs(voice_id, info))
     return VoicesListResponse(voices=voices)
@@ -244,18 +239,13 @@ async def get_voice(
     """
     Get information about a specific voice.
     """
-    provider = get_speechma_provider()
     voice_info = provider.get_voice_info(voice_id)
     if not voice_info:
         raise HTTPException(status_code=404, detail=f"Voice '{voice_id}' not found")
-    return format_voice_to_11labs(voice_info["voice_id"], {
-        "name": voice_info["name"],
-        "gender": voice_info["gender"],
-        "language": voice_info["language"],
-        "country": voice_info.get("country", "Unknown")
-    })
 @router.get("/v1/voices/{voice_id}/settings", response_model=VoiceSettings)
@@ -266,7 +256,7 @@ async def get_voice_settings(
     """
     Get default settings for a voice.
     """
-    provider = get_speechma_provider()
     voice_info = provider.get_voice_info(voice_id)
     if not voice_info:
@@ -289,28 +279,25 @@ async def text_to_speech(
     Returns audio data as MP3.
     """
-    provider = get_speechma_provider()
     # Validate voice
     voice_info = provider.get_voice_info(voice_id)
     if not voice_info:
         raise HTTPException(status_code=404, detail=f"Voice '{voice_id}' not found")
-    # Use provided voice_id or from request
-    actual_voice_id = voice_id
     # Generate speech
     try:
         audio_data = await provider.generate_speech(
             text=request.text,
-            voice_id=actual_voice_id,
-            output_format=request.output_format or "mp3"
         )
         if audio_data is None:
             raise HTTPException(
                 status_code=500,
-                detail="Failed to generate speech. This could be due to CAPTCHA issues or site changes."
             )
         # Return audio with proper headers
@@ -341,11 +328,8 @@ async def text_to_speech_stream(
 ):
     """
     Convert text to speech with streaming response.
-    Note: Since SpeechMA generates complete audio files,
-    this returns the full audio as a stream.
     """
-    provider = get_speechma_provider()
     # Validate voice
     voice_info = provider.get_voice_info(voice_id)
@@ -356,7 +340,7 @@ async def text_to_speech_stream(
         audio_data = await provider.generate_speech(
             text=request.text,
             voice_id=voice_id,
-            output_format=request.output_format or "mp3"
         )
         if audio_data is None:
@@ -390,31 +374,27 @@ async def text_to_speech_stream(
         )
-# Additional SpeechMA-specific endpoints
-@router.post("/v1/tts/speechma")
-async def speechma_tts(
     request: Request,
     key_data: dict = Depends(verify_api_key)
 ):
     """
-    Direct SpeechMA TTS endpoint with custom options.
     Body: {
         "text": "Hello world",
-        "voice_id": "ava",
-        "pitch": 0,
-        "speed": 0,
-        "volume": 100
     }
     """
     data = await request.json()
     text = data.get("text")
-    voice_id = data.get("voice_id", "ava")
-    pitch = data.get("pitch", 0)
-    speed = data.get("speed", 0)
-    volume = data.get("volume", 100)
     if not text:
         raise HTTPException(status_code=400, detail="Text is required")
@@ -422,7 +402,7 @@ async def speechma_tts(
     if len(text) > 2000:
         raise HTTPException(status_code=400, detail="Text exceeds 2000 character limit")
-    provider = get_speechma_provider()
     # Validate voice
     voice_info = provider.get_voice_info(voice_id)
@@ -433,15 +413,13 @@ async def speechma_tts(
         audio_data = await provider.generate_speech(
             text=text,
             voice_id=voice_id,
-            pitch=pitch,
-            speed=speed,
-            volume=volume
         )
         if audio_data is None:
             raise HTTPException(
                 status_code=500,
-                detail="Failed to generate speech. This could be due to CAPTCHA issues."
             )
         return Response(
@@ -460,20 +438,22 @@ async def speechma_tts(
         )
-@router.get("/v1/tts/speechma/voices")
-async def speechma_voices(
     key_data: dict = Depends(verify_api_key)
 ):
     """
-    Get all available SpeechMA voices with full details.
     """
-    provider = get_speechma_provider()
     voices = provider.get_available_voices()
     return JSONResponse({
         "voices": voices,
         "count": len(voices),
-        "default_voice": "ava"
     })
@@ -483,18 +463,20 @@ async def tts_health_check():
     Check if TTS service is healthy.
     """
     try:
-        provider = get_speechma_provider()
         is_healthy = await provider.health_check()
         return JSONResponse({
             "status": "healthy" if is_healthy else "unhealthy",
-            "provider": "speechma",
             "timestamp": time.time()
         })
     except Exception as e:
         return JSONResponse({
             "status": "unhealthy",
-            "provider": "speechma",
             "error": str(e),
             "timestamp": time.time()
         }, status_code=503)

 TTS Router - 11Labs Compatible API
 ----------------------------------
 Text-to-Speech endpoints compatible with ElevenLabs API structure.
+Uses Kokoro as the backend provider - fast, natural, no CAPTCHA!
 """
 from fastapi import APIRouter, Depends, HTTPException, Header, Request, Response
 import json
 from auth import verify_api_key
+from providers.kokoro_tts_provider import get_kokoro_provider
 router = APIRouter()
 class TextToSpeechRequest(BaseModel):
     """11Labs-compatible TTS request."""
     text: str = Field(..., max_length=2000, description="Text to convert to speech")
+    model_id: Optional[str] = Field("eleven_multilingual_v2", description="Model ID (ignored, uses Kokoro)")
     voice_settings: Optional[VoiceSettings] = Field(None, description="Voice settings")
     pronunciation_dictionary_locators: Optional[List[Dict[str, str]]] = None
     seed: Optional[int] = None
     previous_text: Optional[str] = None
     language_code: Optional[str] = None
+    # Kokoro-specific fields
+    voice_id: Optional[str] = Field("bella", description="Voice ID to use")
+    output_format: Optional[str] = Field("mp3", description="Output format")
+    speed: Optional[float] = Field(1.0, ge=0.5, le=2.0, description="Speech speed")
     optimize_streaming_latency: Optional[int] = Field(0, ge=0, le=4)
 # --- Helper Functions ---
 def format_voice_to_11labs(voice_id: str, voice_info: dict) -> VoiceResponse:
+    """Convert Kokoro voice to 11Labs format."""
     return VoiceResponse(
         voice_id=voice_id,
         name=voice_info["name"],
         category="premade",
         labels={
+            "accent": voice_info.get("accent", "Unknown"),
+            "description": f"{voice_info['gender']} voice",
             "age": "adult",
             "gender": voice_info["gender"].lower(),
+            "use_case": "general",
+            "language_code": voice_info.get("language", "en")
         },
+        description=f"{voice_info['gender']} {voice_info.get('accent', 'Unknown')} voice",
         settings=VoiceSettings()
     )
     models = [
         TTSModelInfo(
             model_id="eleven_multilingual_v2",
+            name="Kokoro Multilingual",
+            description="Fast, natural TTS with Kokoro-82M model",
             can_do_text_to_speech=True,
             can_do_voice_conversion=False,
             can_use_style=True,
                 {"language_id": "pt", "name": "Portuguese"},
                 {"language_id": "ja", "name": "Japanese"},
                 {"language_id": "zh", "name": "Chinese"},
                 {"language_id": "hi", "name": "Hindi"},
             ]
         ),
         TTSModelInfo(
             model_id="eleven_flash_v2_5",
+            name="Kokoro Fast",
+            description="Ultra-fast TTS with lower latency",
             can_do_text_to_speech=True,
             can_do_voice_conversion=False,
             can_use_style=False,
     """
     List all available voices.
     """
+    provider = get_kokoro_provider()
     voices_data = provider.get_available_voices()
     voices = []
     for voice_data in voices_data:
         voice_id = voice_data["voice_id"]
+        voices.append(format_voice_to_11labs(voice_id, voice_data))
     return VoicesListResponse(voices=voices)
     """
     Get information about a specific voice.
     """
+    provider = get_kokoro_provider()
     voice_info = provider.get_voice_info(voice_id)
     if not voice_info:
         raise HTTPException(status_code=404, detail=f"Voice '{voice_id}' not found")
+    return format_voice_to_11labs(voice_info["voice_id"], voice_info)
 @router.get("/v1/voices/{voice_id}/settings", response_model=VoiceSettings)
     """
     Get default settings for a voice.
     """
+    provider = get_kokoro_provider()
     voice_info = provider.get_voice_info(voice_id)
     if not voice_info:
     Returns audio data as MP3.
     """
+    provider = get_kokoro_provider()
     # Validate voice
     voice_info = provider.get_voice_info(voice_id)
     if not voice_info:
         raise HTTPException(status_code=404, detail=f"Voice '{voice_id}' not found")
     # Generate speech
     try:
         audio_data = await provider.generate_speech(
             text=request.text,
+            voice_id=voice_id,
+            speed=request.speed or 1.0
         )
         if audio_data is None:
             raise HTTPException(
                 status_code=500,
+                detail="Failed to generate speech."
             )
         # Return audio with proper headers
 ):
     """
     Convert text to speech with streaming response.
     """
+    provider = get_kokoro_provider()
     # Validate voice
     voice_info = provider.get_voice_info(voice_id)
         audio_data = await provider.generate_speech(
             text=request.text,
             voice_id=voice_id,
+            speed=request.speed or 1.0
         )
         if audio_data is None:
         )
+# Kokoro-specific endpoints
+@router.post("/v1/tts/kokoro")
+async def kokoro_tts(
     request: Request,
     key_data: dict = Depends(verify_api_key)
 ):
     """
+    Direct Kokoro TTS endpoint with custom options.
     Body: {
         "text": "Hello world",
+        "voice_id": "bella",
+        "speed": 1.0
     }
     """
     data = await request.json()
     text = data.get("text")
+    voice_id = data.get("voice_id", "bella")
+    speed = data.get("speed", 1.0)
     if not text:
         raise HTTPException(status_code=400, detail="Text is required")
     if len(text) > 2000:
         raise HTTPException(status_code=400, detail="Text exceeds 2000 character limit")
+    provider = get_kokoro_provider()
     # Validate voice
     voice_info = provider.get_voice_info(voice_id)
         audio_data = await provider.generate_speech(
             text=text,
             voice_id=voice_id,
+            speed=speed
         )
         if audio_data is None:
             raise HTTPException(
                 status_code=500,
+                detail="Failed to generate speech."
             )
         return Response(
         )
+@router.get("/v1/tts/kokoro/voices")
+async def kokoro_voices(
     key_data: dict = Depends(verify_api_key)
 ):
     """
+    Get all available Kokoro voices with full details.
     """
+    provider = get_kokoro_provider()
     voices = provider.get_available_voices()
     return JSONResponse({
         "voices": voices,
         "count": len(voices),
+        "default_voice": "bella",
+        "provider": "kokoro",
+        "description": "Fast, natural TTS powered by Kokoro-82M"
     })
     Check if TTS service is healthy.
     """
     try:
+        provider = get_kokoro_provider()
         is_healthy = await provider.health_check()
         return JSONResponse({
             "status": "healthy" if is_healthy else "unhealthy",
+            "provider": "kokoro",
+            "model": "Kokoro-82M",
+            "description": "Fast, natural text-to-speech",
             "timestamp": time.time()
         })
     except Exception as e:
         return JSONResponse({
             "status": "unhealthy",
+            "provider": "kokoro",
             "error": str(e),
             "timestamp": time.time()
         }, status_code=503)