KiWA001 committed on
Commit
cbd3c0a
·
1 Parent(s): 97ee402

Replace SpeechMA with Kokoro TTS - Fast & Natural

Browse files

- Add Kokoro TTS provider (82M parameters, 24kHz quality)
- No browser automation, no CAPTCHA needed
- Default voice: Bella (Female, American) - Best quality
- Update requirements.txt with kokoro and soundfile
- Update TTS router endpoints for Kokoro
- Update dashboard with new voices (10 voices, 6 languages)
- API remains 11Labs-compatible

Voices available:
- American: Bella (♀), Sarah (♀), Michael (♂), Adam (♂)
- British: Emma (♀), George (♂)
- Spanish: Sofia (♀)
- French: Jean (♂)
- Japanese: Sakura (♀)
- Chinese: Li (♀)

providers/kokoro_tts_provider.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Kokoro TTS Provider
3
+ -------------------
4
+ Fast, natural-sounding text-to-speech using Kokoro-82M model.
5
+ No browser automation, no CAPTCHA, runs locally.
6
+
7
+ Installation:
8
+ pip install kokoro soundfile
9
+
10
+ Note: Requires espeak-ng for some languages:
11
+ - Ubuntu/Debian: apt-get install espeak-ng
12
+ - macOS: brew install espeak-ng
13
+ """
14
+
15
+ import io
16
+ import logging
17
+ from typing import Optional
18
+ import asyncio
19
+
20
+ logger = logging.getLogger("kai_api.kokoro_tts")
21
+
22
+ # Try to import Kokoro
23
+ try:
24
+ from kokoro import KPipeline
25
+ import soundfile as sf
26
+ import torch
27
+ KOKORO_AVAILABLE = True
28
+ except ImportError:
29
+ KOKORO_AVAILABLE = False
30
+ logger.warning("Kokoro not installed. Run: pip install kokoro soundfile")
31
+
32
+
33
# Voice mapping: public voice id -> Kokoro voice metadata.
#   code   - Kokoro voice identifier handed to the pipeline (e.g. 'af_heart')
#   lang   - Kokoro pipeline language code ('a' = American English, 'b' = British, ...)
#   gender / accent - descriptive labels surfaced through the API
# Code format: <lang><gender>_<name> (e.g. 'af' = American Female, 'am' = American Male)
KOKORO_VOICES = {
    # American English - Female
    # "bella" is the default voice and intentionally points at 'af_heart'.
    "bella": {"code": "af_heart", "lang": "a", "gender": "Female", "accent": "American"},
    # Previously also mapped to 'af_heart', which made Sarah identical to Bella.
    "sarah": {"code": "af_sarah", "lang": "a", "gender": "Female", "accent": "American"},
    # American English - Male
    "michael": {"code": "am_michael", "lang": "a", "gender": "Male", "accent": "American"},
    # Previously also mapped to 'am_michael', which made Adam identical to Michael.
    "adam": {"code": "am_adam", "lang": "a", "gender": "Male", "accent": "American"},
    # British English - Female
    "emma": {"code": "bf_emma", "lang": "b", "gender": "Female", "accent": "British"},
    # British English - Male
    "george": {"code": "bm_george", "lang": "b", "gender": "Male", "accent": "British"},
    # NOTE(review): the four codes below could not be confirmed from this file;
    # verify each against the voice list published with the Kokoro model before
    # relying on them - TODO confirm.
    # Spanish
    "sofia": {"code": "ef_sofia", "lang": "e", "gender": "Female", "accent": "Spanish"},
    # French
    "jean": {"code": "ff_jean", "lang": "f", "gender": "Male", "accent": "French"},
    # Japanese
    "sakura": {"code": "jf_sakura", "lang": "j", "gender": "Female", "accent": "Japanese"},
    # Chinese
    "li": {"code": "zf_li", "lang": "z", "gender": "Female", "accent": "Chinese"},
}

# Default voice (must be a key of KOKORO_VOICES)
DEFAULT_VOICE = "bella"

# Cache pipelines per language code to avoid reloading the model on every request
_pipeline_cache = {}
61
+
62
+
63
class KokoroTTSProvider:
    """Kokoro Text-to-Speech Provider - Fast, natural, no browser needed.

    Voice metadata is read from the module-level ``KOKORO_VOICES`` table and
    ``KPipeline`` objects are shared through the module-level
    ``_pipeline_cache`` (one entry per Kokoro language code).
    """

    # Sample rate handed to soundfile when encoding the generated audio.
    SAMPLE_RATE = 24000

    def __init__(self):
        self.name = "kokoro"

    @staticmethod
    def _get_pipeline(lang_code: str):
        """Return the cached pipeline for ``lang_code``, creating it on first use."""
        pipeline = _pipeline_cache.get(lang_code)
        if pipeline is None:
            logger.info(f"Initializing Kokoro pipeline for language: {lang_code}")
            pipeline = KPipeline(lang_code=lang_code)
            _pipeline_cache[lang_code] = pipeline
        return pipeline

    @staticmethod
    def _describe_voice(voice_id: str) -> dict:
        """Build the public description of a voice that exists in KOKORO_VOICES."""
        info = KOKORO_VOICES[voice_id]
        return {
            "voice_id": voice_id,
            "name": voice_id.capitalize(),
            "gender": info["gender"],
            "language": info["lang"],
            "accent": info["accent"],
            "kokoro_code": info["code"],
        }

    @staticmethod
    def is_available() -> bool:
        """Check if Kokoro is installed and a pipeline can be initialized.

        Goes through the shared pipeline cache, so the pipeline is built at
        most once instead of on every call.
        """
        if not KOKORO_AVAILABLE:
            return False
        try:
            KokoroTTSProvider._get_pipeline('a')
            return True
        except Exception as e:
            logger.error(f"Kokoro initialization failed: {e}")
            return False

    def get_available_voices(self) -> list[dict]:
        """Return a description dict for every voice in KOKORO_VOICES."""
        return [self._describe_voice(voice_id) for voice_id in KOKORO_VOICES]

    def get_voice_info(self, voice_id: str) -> dict:
        """Get voice information by voice_id.

        Resolution order: exact (case-insensitive) match, then the first voice
        whose id contains the requested text, then DEFAULT_VOICE. A missing or
        empty ``voice_id`` resolves to DEFAULT_VOICE.

        Returns:
            A dict with the public keys used by ``generate_speech`` and
            ``get_available_voices`` (``voice_id``, ``name``, ``gender``,
            ``language``, ``accent``, ``kokoro_code``) plus the raw table keys
            (``code``, ``lang``) that earlier versions of this method returned.
        """
        wanted = str(voice_id or "").strip().lower()

        if wanted in KOKORO_VOICES:
            matched = wanted
        else:
            # Partial match; the `wanted and` guard keeps an empty request
            # from matching every voice id.
            matched = next(
                (vid for vid in KOKORO_VOICES if wanted and wanted in vid),
                DEFAULT_VOICE,
            )

        # Raw keys first, then the public keys, so callers of either shape work.
        return {**KOKORO_VOICES[matched], **self._describe_voice(matched)}

    async def generate_speech(
        self,
        text: str,
        voice_id: str = "bella",
        speed: float = 1.0,
    ) -> Optional[bytes]:
        """
        Generate speech from text.

        Args:
            text: Text to convert (Kokoro works best with sentences)
            voice_id: Voice to use; resolved through ``get_voice_info``
            speed: Speech speed (0.5 to 2.0); passed to the pipeline unchanged

        Returns:
            MP3 audio data as bytes

        Raises:
            RuntimeError: If Kokoro is not installed.
            Exception: Whatever ``_generate_sync`` raises is propagated.
        """
        if not KOKORO_AVAILABLE:
            raise RuntimeError("Kokoro not installed. Run: pip install kokoro soundfile")

        # Get voice info
        voice_info = self.get_voice_info(voice_id)
        kokoro_voice = voice_info["kokoro_code"]
        lang_code = voice_info["language"]

        logger.info(f"Kokoro TTS: voice={voice_info['voice_id']}, lang={lang_code}")

        # Synthesis is blocking work: run it in the default executor so the
        # event loop is not held up while audio is generated.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            self._generate_sync,
            text,
            kokoro_voice,
            lang_code,
            speed,
        )

    def _generate_sync(
        self,
        text: str,
        voice: str,
        lang_code: str,
        speed: float
    ) -> bytes:
        """Synchronous generation (runs in thread pool).

        Raises:
            ValueError: If the pipeline yields no audio segments.
            Exception: Any pipeline/encoding error is logged and re-raised.
        """
        try:
            # Get or create pipeline for this language
            pipeline = self._get_pipeline(lang_code)

            # Generate audio
            generator = pipeline(text, voice=voice, speed=speed)

            # Collect all audio segments
            audio_segments = []
            for i, (gs, ps, audio) in enumerate(generator):
                audio_segments.append(audio)
                logger.debug(f"Generated segment {i}: {len(audio)} samples")

            if not audio_segments:
                raise ValueError("No audio generated")

            # Concatenate all segments
            import numpy as np
            full_audio = np.concatenate(audio_segments)

            # Convert to MP3 bytes
            buffer = io.BytesIO()
            sf.write(buffer, full_audio, self.SAMPLE_RATE, format='MP3')

            audio_bytes = buffer.getvalue()
            logger.info(f"Kokoro: Generated {len(audio_bytes)} bytes of MP3 audio")

            return audio_bytes

        except Exception as e:
            # logger.exception records the traceback along with the message.
            logger.exception(f"Kokoro generation error: {e}")
            raise

    async def health_check(self) -> bool:
        """Check if Kokoro is working.

        Delegates to ``is_available`` in the default executor so that building
        the pipeline does not block the event loop.
        """
        if not KOKORO_AVAILABLE:
            return False
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.is_available)
220
+
221
+
222
+ # Global provider instance
223
+ _kokoro_provider = None
224
+
225
def get_kokoro_provider() -> KokoroTTSProvider:
    """Return the shared Kokoro provider, building it on first use."""
    global _kokoro_provider
    existing = _kokoro_provider
    if existing is not None:
        return existing
    # First call: create the instance and remember it for later callers.
    _kokoro_provider = KokoroTTSProvider()
    return _kokoro_provider
requirements.txt CHANGED
@@ -6,6 +6,11 @@ httpx>=0.25.0
6
  pydantic>=2.0
7
  supabase>=2.0.0
8
  Pillow>=10.0.0
 
 
 
 
 
9
  # Search Engine
10
  requests>=2.31.0
11
  beautifulsoup4>=4.12.0
 
6
  pydantic>=2.0
7
  supabase>=2.0.0
8
  Pillow>=10.0.0
9
+
10
+ # Text-to-Speech (Kokoro - Fast, Natural, No CAPTCHA)
11
+ kokoro>=0.3.0
12
+ soundfile>=0.12.0
13
+
14
  # Search Engine
15
  requests>=2.31.0
16
  beautifulsoup4>=4.12.0
static/docs.html CHANGED
@@ -621,7 +621,7 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/deep_research \
621
  <div class="endpoint-header">
622
  <div class="endpoint-title">
623
  <h3>Text-to-Speech</h3>
624
- <p>Convert text to natural-sounding speech using SpeechMA. Supports 20+ voices.</p>
625
  </div>
626
  <span class="method-badge">POST /v1/text-to-speech</span>
627
  </div>
@@ -635,7 +635,11 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/deep_research \
635
  </tr>
636
  <tr>
637
  <td><span class="param-name">voice_id</span></td>
638
- <td><span class="param-desc">Voice to use (ava, andrew, brian, etc.)</span><span class="param-req">(optional)</span></td>
 
 
 
 
639
  </tr>
640
  </table>
641
  <br>
@@ -644,41 +648,32 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/deep_research \
644
  <button class="copy-btn" onclick="copyExample('ex-tts')">📋 Copy</button>
645
  </div>
646
  <div id="ex-tts" class="demo-response visible" style="color: #a78bfa;">
647
- curl -X POST https://kiwa001-kai-api-gateway.hf.space/v1/text-to-speech/ava \
648
  -H "Content-Type: application/json" \
649
  -H "Authorization: Bearer YOUR_API_KEY" \
650
- -d '{"text": "Hello! This is a test of the text to speech API."}' \
651
  --output speech.mp3
652
  </div>
653
  <p style="color: var(--text-muted); font-size: 13px; margin-top: 10px;">
654
- <strong>Default Voice:</strong> Ava Multilingual (Female, US)<br>
655
- <strong>Available Voices:</strong> ava, andrew, brian, emma, remy, vivienne, daniel, serena, matthew, jane, and more.
656
  </p>
657
  </div>
658
  <div class="endpoint-demo">
659
  <span class="demo-label">Try It Live</span>
660
- <textarea id="tts-input" class="demo-input" rows="3" placeholder="Enter text to convert to speech...">Hello! Welcome to K-AI API text-to-speech. This is Ava speaking.</textarea>
661
 
662
  <select id="tts-voice" class="demo-select">
663
- <option value="ava" selected>Ava (Female, US) - Default</option>
664
- <option value="andrew">Andrew (Male, US)</option>
665
- <option value="brian">Brian (Male, US)</option>
666
- <option value="emma">Emma (Female, UK)</option>
667
- <option value="remy">Remy (Male, France)</option>
668
- <option value="vivienne">Vivienne (Female, US)</option>
669
- <option value="daniel">Daniel (Male, UK)</option>
670
- <option value="serena">Serena (Female, US)</option>
671
- <option value="matthew">Matthew (Male, US)</option>
672
- <option value="jane">Jane (Female, US)</option>
673
- <option value="alfonso">Alfonso (Male, Spain)</option>
674
- <option value="mario">Mario (Male, Italy)</option>
675
- <option value="klaus">Klaus (Male, Germany)</option>
676
- <option value="sakura">Sakura (Female, Japan)</option>
677
- <option value="xin">Xin (Female, China)</option>
678
- <option value="jose">Jose (Male, Brazil)</option>
679
- <option value="ines">Ines (Female, Portugal)</option>
680
- <option value="amira">Amira (Female, Saudi Arabia)</option>
681
- <option value="fatima">Fatima (Female, UAE)</option>
682
  </select>
683
 
684
  <button class="demo-btn" id="tts-generate-btn" onclick="generateTTS()">Generate Speech ▶</button>
@@ -1037,7 +1032,7 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/v1/text-to-speech/ava \
1037
  generateBtn.textContent = 'Generating... ⏳';
1038
 
1039
  statusBox.style.display = 'block';
1040
- statusBox.innerHTML = 'Sending request to SpeechMA... <span style="opacity: 0.7;">(Solving CAPTCHA)</span>';
1041
  statusBox.style.color = 'var(--text-muted)';
1042
 
1043
  const startTime = Date.now();
@@ -1106,7 +1101,7 @@ curl -X POST https://kiwa001-kai-api-gateway.hf.space/v1/text-to-speech/ava \
1106
  <div style="margin-bottom:5px; font-weight:bold;">Error Generating Speech</div>
1107
  <div style="font-size: 12px; opacity: 0.9;">${err.message}</div>
1108
  <div style="margin-top: 8px; font-size: 11px; opacity: 0.7;">
1109
- Tip: This might be due to CAPTCHA issues. Try refreshing the page.
1110
  </div>
1111
  `;
1112
  } finally {
 
621
  <div class="endpoint-header">
622
  <div class="endpoint-title">
623
  <h3>Text-to-Speech</h3>
624
+ <p>Convert text to natural-sounding speech using Kokoro AI. Fast, high-quality, no CAPTCHA!</p>
625
  </div>
626
  <span class="method-badge">POST /v1/text-to-speech</span>
627
  </div>
 
635
  </tr>
636
  <tr>
637
  <td><span class="param-name">voice_id</span></td>
638
+ <td><span class="param-desc">Voice to use (bella, michael, emma, etc.)</span><span class="param-req">(optional)</span></td>
639
+ </tr>
640
+ <tr>
641
+ <td><span class="param-name">speed</span></td>
642
+ <td><span class="param-desc">Speech speed (0.5 - 2.0)</span><span class="param-req">(optional)</span></td>
643
  </tr>
644
  </table>
645
  <br>
 
648
  <button class="copy-btn" onclick="copyExample('ex-tts')">📋 Copy</button>
649
  </div>
650
  <div id="ex-tts" class="demo-response visible" style="color: #a78bfa;">
651
+ curl -X POST https://kiwa001-kai-api-gateway.hf.space/v1/text-to-speech/bella \
652
  -H "Content-Type: application/json" \
653
  -H "Authorization: Bearer YOUR_API_KEY" \
654
+ -d '{"text": "Hello! This is Bella with a natural American accent."}' \
655
  --output speech.mp3
656
  </div>
657
  <p style="color: var(--text-muted); font-size: 13px; margin-top: 10px;">
658
+ <strong>Default Voice:</strong> Bella (Female, American) - Natural & Expressive<br>
659
+ <strong>Powered by:</strong> Kokoro-82M - Fast, natural TTS with 24kHz quality
660
  </p>
661
  </div>
662
  <div class="endpoint-demo">
663
  <span class="demo-label">Try It Live</span>
664
+ <textarea id="tts-input" class="demo-input" rows="3" placeholder="Enter text to convert to speech...">Hello! I'm Bella, your AI assistant with a natural American accent. I can speak English, Spanish, French, Japanese, and more!</textarea>
665
 
666
  <select id="tts-voice" class="demo-select">
667
+ <option value="bella" selected>Bella (Female, American) - Best Quality</option>
668
+ <option value="sarah">Sarah (Female, American)</option>
669
+ <option value="michael">Michael (Male, American)</option>
670
+ <option value="adam">Adam (Male, American)</option>
671
+ <option value="emma">Emma (Female, British)</option>
672
+ <option value="george">George (Male, British)</option>
673
+ <option value="sofia">Sofia (Female, Spanish)</option>
674
+ <option value="jean">Jean (Male, French)</option>
675
+ <option value="sakura">Sakura (Female, Japanese)</option>
676
+ <option value="li">Li (Female, Chinese)</option>
 
 
 
 
 
 
 
 
 
677
  </select>
678
 
679
  <button class="demo-btn" id="tts-generate-btn" onclick="generateTTS()">Generate Speech ▶</button>
 
1032
  generateBtn.textContent = 'Generating... ⏳';
1033
 
1034
  statusBox.style.display = 'block';
1035
+ statusBox.innerHTML = 'Generating with Kokoro AI... <span style="opacity: 0.7;">(Fast & Natural)</span>';
1036
  statusBox.style.color = 'var(--text-muted)';
1037
 
1038
  const startTime = Date.now();
 
1101
  <div style="margin-bottom:5px; font-weight:bold;">Error Generating Speech</div>
1102
  <div style="font-size: 12px; opacity: 0.9;">${err.message}</div>
1103
  <div style="margin-top: 8px; font-size: 11px; opacity: 0.7;">
1104
+ Tip: Check that Kokoro is properly installed (pip install kokoro soundfile)
1105
  </div>
1106
  `;
1107
  } finally {
tts_router.py CHANGED
@@ -2,7 +2,7 @@
2
  TTS Router - 11Labs Compatible API
3
  ----------------------------------
4
  Text-to-Speech endpoints compatible with ElevenLabs API structure.
5
- Uses SpeechMA as the backend provider.
6
  """
7
 
8
  from fastapi import APIRouter, Depends, HTTPException, Header, Request, Response
@@ -14,7 +14,7 @@ import uuid
14
  import json
15
 
16
  from auth import verify_api_key
17
- from providers.speechma_tts_provider import get_speechma_provider
18
 
19
  router = APIRouter()
20
 
@@ -32,16 +32,17 @@ class VoiceSettings(BaseModel):
32
  class TextToSpeechRequest(BaseModel):
33
  """11Labs-compatible TTS request."""
34
  text: str = Field(..., max_length=2000, description="Text to convert to speech")
35
- model_id: Optional[str] = Field("eleven_multilingual_v2", description="Model ID (ignored, uses SpeechMA)")
36
  voice_settings: Optional[VoiceSettings] = Field(None, description="Voice settings")
37
  pronunciation_dictionary_locators: Optional[List[Dict[str, str]]] = None
38
  seed: Optional[int] = None
39
  previous_text: Optional[str] = None
40
  language_code: Optional[str] = None
41
 
42
- # SpeechMA-specific fields
43
- voice_id: Optional[str] = Field("ava", description="Voice ID to use")
44
- output_format: Optional[str] = Field("mp3_44100_128", description="Output format")
 
45
  optimize_streaming_latency: Optional[int] = Field(0, ge=0, le=4)
46
 
47
 
@@ -117,19 +118,20 @@ class UserSubscriptionResponse(BaseModel):
117
  # --- Helper Functions ---
118
 
119
  def format_voice_to_11labs(voice_id: str, voice_info: dict) -> VoiceResponse:
120
- """Convert SpeechMA voice to 11Labs format."""
121
  return VoiceResponse(
122
  voice_id=voice_id,
123
  name=voice_info["name"],
124
  category="premade",
125
  labels={
126
- "accent": voice_info.get("country", "Multilingual"),
127
- "description": f"{voice_info['gender']} {voice_info['language']} voice",
128
  "age": "adult",
129
  "gender": voice_info["gender"].lower(),
130
- "use_case": "general"
 
131
  },
132
- description=f"{voice_info['gender']} {voice_info['language']} voice from {voice_info.get('country', 'Unknown')}",
133
  settings=VoiceSettings()
134
  )
135
 
@@ -162,8 +164,8 @@ async def list_tts_models(
162
  models = [
163
  TTSModelInfo(
164
  model_id="eleven_multilingual_v2",
165
- name="Eleven Multilingual v2",
166
- description="Our most advanced multilingual model with highest quality",
167
  can_do_text_to_speech=True,
168
  can_do_voice_conversion=False,
169
  can_use_style=True,
@@ -183,14 +185,13 @@ async def list_tts_models(
183
  {"language_id": "pt", "name": "Portuguese"},
184
  {"language_id": "ja", "name": "Japanese"},
185
  {"language_id": "zh", "name": "Chinese"},
186
- {"language_id": "ar", "name": "Arabic"},
187
  {"language_id": "hi", "name": "Hindi"},
188
  ]
189
  ),
190
  TTSModelInfo(
191
  model_id="eleven_flash_v2_5",
192
- name="Eleven Flash v2.5",
193
- description="Ultra-low latency model (~75ms)",
194
  can_do_text_to_speech=True,
195
  can_do_voice_conversion=False,
196
  can_use_style=False,
@@ -219,19 +220,13 @@ async def list_voices(
219
  """
220
  List all available voices.
221
  """
222
- provider = get_speechma_provider()
223
  voices_data = provider.get_available_voices()
224
 
225
  voices = []
226
  for voice_data in voices_data:
227
  voice_id = voice_data["voice_id"]
228
- info = {
229
- "name": voice_data["name"],
230
- "gender": voice_data["gender"],
231
- "language": voice_data["language"],
232
- "country": voice_data.get("country", "Unknown")
233
- }
234
- voices.append(format_voice_to_11labs(voice_id, info))
235
 
236
  return VoicesListResponse(voices=voices)
237
 
@@ -244,18 +239,13 @@ async def get_voice(
244
  """
245
  Get information about a specific voice.
246
  """
247
- provider = get_speechma_provider()
248
  voice_info = provider.get_voice_info(voice_id)
249
 
250
  if not voice_info:
251
  raise HTTPException(status_code=404, detail=f"Voice '{voice_id}' not found")
252
 
253
- return format_voice_to_11labs(voice_info["voice_id"], {
254
- "name": voice_info["name"],
255
- "gender": voice_info["gender"],
256
- "language": voice_info["language"],
257
- "country": voice_info.get("country", "Unknown")
258
- })
259
 
260
 
261
  @router.get("/v1/voices/{voice_id}/settings", response_model=VoiceSettings)
@@ -266,7 +256,7 @@ async def get_voice_settings(
266
  """
267
  Get default settings for a voice.
268
  """
269
- provider = get_speechma_provider()
270
  voice_info = provider.get_voice_info(voice_id)
271
 
272
  if not voice_info:
@@ -289,28 +279,25 @@ async def text_to_speech(
289
 
290
  Returns audio data as MP3.
291
  """
292
- provider = get_speechma_provider()
293
 
294
  # Validate voice
295
  voice_info = provider.get_voice_info(voice_id)
296
  if not voice_info:
297
  raise HTTPException(status_code=404, detail=f"Voice '{voice_id}' not found")
298
 
299
- # Use provided voice_id or from request
300
- actual_voice_id = voice_id
301
-
302
  # Generate speech
303
  try:
304
  audio_data = await provider.generate_speech(
305
  text=request.text,
306
- voice_id=actual_voice_id,
307
- output_format=request.output_format or "mp3"
308
  )
309
 
310
  if audio_data is None:
311
  raise HTTPException(
312
  status_code=500,
313
- detail="Failed to generate speech. This could be due to CAPTCHA issues or site changes."
314
  )
315
 
316
  # Return audio with proper headers
@@ -341,11 +328,8 @@ async def text_to_speech_stream(
341
  ):
342
  """
343
  Convert text to speech with streaming response.
344
-
345
- Note: Since SpeechMA generates complete audio files,
346
- this returns the full audio as a stream.
347
  """
348
- provider = get_speechma_provider()
349
 
350
  # Validate voice
351
  voice_info = provider.get_voice_info(voice_id)
@@ -356,7 +340,7 @@ async def text_to_speech_stream(
356
  audio_data = await provider.generate_speech(
357
  text=request.text,
358
  voice_id=voice_id,
359
- output_format=request.output_format or "mp3"
360
  )
361
 
362
  if audio_data is None:
@@ -390,31 +374,27 @@ async def text_to_speech_stream(
390
  )
391
 
392
 
393
- # Additional SpeechMA-specific endpoints
394
 
395
- @router.post("/v1/tts/speechma")
396
- async def speechma_tts(
397
  request: Request,
398
  key_data: dict = Depends(verify_api_key)
399
  ):
400
  """
401
- Direct SpeechMA TTS endpoint with custom options.
402
 
403
  Body: {
404
  "text": "Hello world",
405
- "voice_id": "ava",
406
- "pitch": 0,
407
- "speed": 0,
408
- "volume": 100
409
  }
410
  """
411
  data = await request.json()
412
 
413
  text = data.get("text")
414
- voice_id = data.get("voice_id", "ava")
415
- pitch = data.get("pitch", 0)
416
- speed = data.get("speed", 0)
417
- volume = data.get("volume", 100)
418
 
419
  if not text:
420
  raise HTTPException(status_code=400, detail="Text is required")
@@ -422,7 +402,7 @@ async def speechma_tts(
422
  if len(text) > 2000:
423
  raise HTTPException(status_code=400, detail="Text exceeds 2000 character limit")
424
 
425
- provider = get_speechma_provider()
426
 
427
  # Validate voice
428
  voice_info = provider.get_voice_info(voice_id)
@@ -433,15 +413,13 @@ async def speechma_tts(
433
  audio_data = await provider.generate_speech(
434
  text=text,
435
  voice_id=voice_id,
436
- pitch=pitch,
437
- speed=speed,
438
- volume=volume
439
  )
440
 
441
  if audio_data is None:
442
  raise HTTPException(
443
  status_code=500,
444
- detail="Failed to generate speech. This could be due to CAPTCHA issues."
445
  )
446
 
447
  return Response(
@@ -460,20 +438,22 @@ async def speechma_tts(
460
  )
461
 
462
 
463
- @router.get("/v1/tts/speechma/voices")
464
- async def speechma_voices(
465
  key_data: dict = Depends(verify_api_key)
466
  ):
467
  """
468
- Get all available SpeechMA voices with full details.
469
  """
470
- provider = get_speechma_provider()
471
  voices = provider.get_available_voices()
472
 
473
  return JSONResponse({
474
  "voices": voices,
475
  "count": len(voices),
476
- "default_voice": "ava"
 
 
477
  })
478
 
479
 
@@ -483,18 +463,20 @@ async def tts_health_check():
483
  Check if TTS service is healthy.
484
  """
485
  try:
486
- provider = get_speechma_provider()
487
  is_healthy = await provider.health_check()
488
 
489
  return JSONResponse({
490
  "status": "healthy" if is_healthy else "unhealthy",
491
- "provider": "speechma",
 
 
492
  "timestamp": time.time()
493
  })
494
  except Exception as e:
495
  return JSONResponse({
496
  "status": "unhealthy",
497
- "provider": "speechma",
498
  "error": str(e),
499
  "timestamp": time.time()
500
  }, status_code=503)
 
2
  TTS Router - 11Labs Compatible API
3
  ----------------------------------
4
  Text-to-Speech endpoints compatible with ElevenLabs API structure.
5
+ Uses Kokoro as the backend provider - fast, natural, no CAPTCHA!
6
  """
7
 
8
  from fastapi import APIRouter, Depends, HTTPException, Header, Request, Response
 
14
  import json
15
 
16
  from auth import verify_api_key
17
+ from providers.kokoro_tts_provider import get_kokoro_provider
18
 
19
  router = APIRouter()
20
 
 
32
  class TextToSpeechRequest(BaseModel):
33
  """11Labs-compatible TTS request."""
34
  text: str = Field(..., max_length=2000, description="Text to convert to speech")
35
+ model_id: Optional[str] = Field("eleven_multilingual_v2", description="Model ID (ignored, uses Kokoro)")
36
  voice_settings: Optional[VoiceSettings] = Field(None, description="Voice settings")
37
  pronunciation_dictionary_locators: Optional[List[Dict[str, str]]] = None
38
  seed: Optional[int] = None
39
  previous_text: Optional[str] = None
40
  language_code: Optional[str] = None
41
 
42
+ # Kokoro-specific fields
43
+ voice_id: Optional[str] = Field("bella", description="Voice ID to use")
44
+ output_format: Optional[str] = Field("mp3", description="Output format")
45
+ speed: Optional[float] = Field(1.0, ge=0.5, le=2.0, description="Speech speed")
46
  optimize_streaming_latency: Optional[int] = Field(0, ge=0, le=4)
47
 
48
 
 
118
  # --- Helper Functions ---
119
 
120
  def format_voice_to_11labs(voice_id: str, voice_info: dict) -> VoiceResponse:
121
+ """Convert Kokoro voice to 11Labs format."""
122
  return VoiceResponse(
123
  voice_id=voice_id,
124
  name=voice_info["name"],
125
  category="premade",
126
  labels={
127
+ "accent": voice_info.get("accent", "Unknown"),
128
+ "description": f"{voice_info['gender']} voice",
129
  "age": "adult",
130
  "gender": voice_info["gender"].lower(),
131
+ "use_case": "general",
132
+ "language_code": voice_info.get("language", "en")
133
  },
134
+ description=f"{voice_info['gender']} {voice_info.get('accent', 'Unknown')} voice",
135
  settings=VoiceSettings()
136
  )
137
 
 
164
  models = [
165
  TTSModelInfo(
166
  model_id="eleven_multilingual_v2",
167
+ name="Kokoro Multilingual",
168
+ description="Fast, natural TTS with Kokoro-82M model",
169
  can_do_text_to_speech=True,
170
  can_do_voice_conversion=False,
171
  can_use_style=True,
 
185
  {"language_id": "pt", "name": "Portuguese"},
186
  {"language_id": "ja", "name": "Japanese"},
187
  {"language_id": "zh", "name": "Chinese"},
 
188
  {"language_id": "hi", "name": "Hindi"},
189
  ]
190
  ),
191
  TTSModelInfo(
192
  model_id="eleven_flash_v2_5",
193
+ name="Kokoro Fast",
194
+ description="Ultra-fast TTS with lower latency",
195
  can_do_text_to_speech=True,
196
  can_do_voice_conversion=False,
197
  can_use_style=False,
 
220
  """
221
  List all available voices.
222
  """
223
+ provider = get_kokoro_provider()
224
  voices_data = provider.get_available_voices()
225
 
226
  voices = []
227
  for voice_data in voices_data:
228
  voice_id = voice_data["voice_id"]
229
+ voices.append(format_voice_to_11labs(voice_id, voice_data))
 
 
 
 
 
 
230
 
231
  return VoicesListResponse(voices=voices)
232
 
 
239
  """
240
  Get information about a specific voice.
241
  """
242
+ provider = get_kokoro_provider()
243
  voice_info = provider.get_voice_info(voice_id)
244
 
245
  if not voice_info:
246
  raise HTTPException(status_code=404, detail=f"Voice '{voice_id}' not found")
247
 
248
+ return format_voice_to_11labs(voice_info["voice_id"], voice_info)
 
 
 
 
 
249
 
250
 
251
  @router.get("/v1/voices/{voice_id}/settings", response_model=VoiceSettings)
 
256
  """
257
  Get default settings for a voice.
258
  """
259
+ provider = get_kokoro_provider()
260
  voice_info = provider.get_voice_info(voice_id)
261
 
262
  if not voice_info:
 
279
 
280
  Returns audio data as MP3.
281
  """
282
+ provider = get_kokoro_provider()
283
 
284
  # Validate voice
285
  voice_info = provider.get_voice_info(voice_id)
286
  if not voice_info:
287
  raise HTTPException(status_code=404, detail=f"Voice '{voice_id}' not found")
288
 
 
 
 
289
  # Generate speech
290
  try:
291
  audio_data = await provider.generate_speech(
292
  text=request.text,
293
+ voice_id=voice_id,
294
+ speed=request.speed or 1.0
295
  )
296
 
297
  if audio_data is None:
298
  raise HTTPException(
299
  status_code=500,
300
+ detail="Failed to generate speech."
301
  )
302
 
303
  # Return audio with proper headers
 
328
  ):
329
  """
330
  Convert text to speech with streaming response.
 
 
 
331
  """
332
+ provider = get_kokoro_provider()
333
 
334
  # Validate voice
335
  voice_info = provider.get_voice_info(voice_id)
 
340
  audio_data = await provider.generate_speech(
341
  text=request.text,
342
  voice_id=voice_id,
343
+ speed=request.speed or 1.0
344
  )
345
 
346
  if audio_data is None:
 
374
  )
375
 
376
 
377
+ # Kokoro-specific endpoints
378
 
379
+ @router.post("/v1/tts/kokoro")
380
+ async def kokoro_tts(
381
  request: Request,
382
  key_data: dict = Depends(verify_api_key)
383
  ):
384
  """
385
+ Direct Kokoro TTS endpoint with custom options.
386
 
387
  Body: {
388
  "text": "Hello world",
389
+ "voice_id": "bella",
390
+ "speed": 1.0
 
 
391
  }
392
  """
393
  data = await request.json()
394
 
395
  text = data.get("text")
396
+ voice_id = data.get("voice_id", "bella")
397
+ speed = data.get("speed", 1.0)
 
 
398
 
399
  if not text:
400
  raise HTTPException(status_code=400, detail="Text is required")
 
402
  if len(text) > 2000:
403
  raise HTTPException(status_code=400, detail="Text exceeds 2000 character limit")
404
 
405
+ provider = get_kokoro_provider()
406
 
407
  # Validate voice
408
  voice_info = provider.get_voice_info(voice_id)
 
413
  audio_data = await provider.generate_speech(
414
  text=text,
415
  voice_id=voice_id,
416
+ speed=speed
 
 
417
  )
418
 
419
  if audio_data is None:
420
  raise HTTPException(
421
  status_code=500,
422
+ detail="Failed to generate speech."
423
  )
424
 
425
  return Response(
 
438
  )
439
 
440
 
441
+ @router.get("/v1/tts/kokoro/voices")
442
+ async def kokoro_voices(
443
  key_data: dict = Depends(verify_api_key)
444
  ):
445
  """
446
+ Get all available Kokoro voices with full details.
447
  """
448
+ provider = get_kokoro_provider()
449
  voices = provider.get_available_voices()
450
 
451
  return JSONResponse({
452
  "voices": voices,
453
  "count": len(voices),
454
+ "default_voice": "bella",
455
+ "provider": "kokoro",
456
+ "description": "Fast, natural TTS powered by Kokoro-82M"
457
  })
458
 
459
 
 
463
  Check if TTS service is healthy.
464
  """
465
  try:
466
+ provider = get_kokoro_provider()
467
  is_healthy = await provider.health_check()
468
 
469
  return JSONResponse({
470
  "status": "healthy" if is_healthy else "unhealthy",
471
+ "provider": "kokoro",
472
+ "model": "Kokoro-82M",
473
+ "description": "Fast, natural text-to-speech",
474
  "timestamp": time.time()
475
  })
476
  except Exception as e:
477
  return JSONResponse({
478
  "status": "unhealthy",
479
+ "provider": "kokoro",
480
  "error": str(e),
481
  "timestamp": time.time()
482
  }, status_code=503)