Tim Luka Horstmann
committed on
Commit
·
964084b
1
Parent(s):
8d04f0d
Use gemini TTS
Browse files
app.py
CHANGED
|
@@ -18,6 +18,8 @@ import psutil # Added for RAM tracking
|
|
| 18 |
from google import genai
|
| 19 |
from google.genai import types
|
| 20 |
import httpx
|
|
|
|
|
|
|
| 21 |
from elevenlabs import ElevenLabs, VoiceSettings
|
| 22 |
from slowapi import Limiter, _rate_limit_exceeded_handler
|
| 23 |
from slowapi.util import get_remote_address
|
|
@@ -104,6 +106,11 @@ else:
|
|
| 104 |
elevenlabs_client = None
|
| 105 |
logger.info("ElevenLabs TTS disabled (no API key provided)")
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
# Define FAQs
|
| 108 |
faqs = [
|
| 109 |
{"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
|
|
@@ -549,48 +556,96 @@ async def predict(request: Request, query_request: QueryRequest):
|
|
| 549 |
@app.post("/api/tts")
|
| 550 |
@limiter.limit("5/minute") # Allow 5 TTS requests per minute per IP
|
| 551 |
async def text_to_speech(request: Request, tts_request: TTSRequest):
|
| 552 |
-
"""Convert text to speech using ElevenLabs API"""
|
| 553 |
-
if not elevenlabs_client:
|
| 554 |
-
raise HTTPException(status_code=503, detail="TTS service not available")
|
| 555 |
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
|
|
|
|
|
|
|
|
|
| 565 |
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
)
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
|
| 595 |
@app.get("/health")
|
| 596 |
@limiter.limit("30/minute") # Allow frequent health checks
|
|
|
|
| 18 |
from google import genai
|
| 19 |
from google.genai import types
|
| 20 |
import httpx
|
| 21 |
+
import wave
|
| 22 |
+
import io
|
| 23 |
from elevenlabs import ElevenLabs, VoiceSettings
|
| 24 |
from slowapi import Limiter, _rate_limit_exceeded_handler
|
| 25 |
from slowapi.util import get_remote_address
|
|
|
|
| 106 |
elevenlabs_client = None
|
| 107 |
logger.info("ElevenLabs TTS disabled (no API key provided)")
|
| 108 |
|
| 109 |
+
# TTS Configuration
|
| 110 |
+
tts_provider = os.getenv("TTS_PROVIDER", "elevenlabs").lower()
|
| 111 |
+
gemini_tts_model = os.getenv("GEMINI_TTS_MODEL", "gemini-2.5-flash-preview-tts")
|
| 112 |
+
gemini_tts_voice = os.getenv("GEMINI_TTS_VOICE", "Kore")
|
| 113 |
+
|
| 114 |
# Define FAQs
|
| 115 |
faqs = [
|
| 116 |
{"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
|
|
|
|
| 556 |
@app.post("/api/tts")
@limiter.limit("5/minute")  # Allow 5 TTS requests per minute per IP
async def text_to_speech(request: Request, tts_request: TTSRequest):
    """Convert text to speech using the ElevenLabs or Gemini API.

    The provider is chosen by the module-level ``tts_provider`` setting:
    ``"gemini"`` selects Gemini, any other value selects ElevenLabs.

    Args:
        request: The incoming request; passed so the rate-limiter decorator
            can inspect it.
        tts_request: Request body; only its ``text`` attribute is read here.

    Returns:
        A ``Response`` containing WAV audio (``audio/wav``) for Gemini or
        MP3 audio (``audio/mpeg``) for ElevenLabs.

    Raises:
        HTTPException: 400 if the cleaned text is empty, 503 if the selected
            provider's client is not configured, 500 if synthesis fails.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    # Clean the text for TTS (remove markdown and special characters)
    clean_text = tts_request.text.replace("**", "").replace("*", "").replace("\n", " ").strip()

    if not clean_text:
        raise HTTPException(status_code=400, detail="No text provided for TTS")

    if len(clean_text) > 1000:  # Limit text length to avoid long processing times
        clean_text = clean_text[:1000] + "..."

    if tts_provider == "gemini":
        if not gemini_client:
            raise HTTPException(status_code=503, detail="Gemini TTS service not available (API key missing)")

        def _synthesize_gemini() -> bytes:
            """Call Gemini synchronously and return the audio wrapped as WAV bytes."""
            response = gemini_client.models.generate_content(
                model=gemini_tts_model,
                contents=clean_text,
                config=types.GenerateContentConfig(
                    response_modalities=["AUDIO"],
                    speech_config=types.SpeechConfig(
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                voice_name=gemini_tts_voice,
                            )
                        )
                    ),
                )
            )

            # Get raw PCM data. An empty or blocked response has no
            # candidates/parts/inline data; report that explicitly instead of
            # surfacing an opaque IndexError/AttributeError.
            try:
                pcm_data = response.candidates[0].content.parts[0].inline_data.data
            except (AttributeError, IndexError, TypeError) as exc:
                raise RuntimeError("Gemini returned no audio data") from exc
            if not pcm_data:
                raise RuntimeError("Gemini returned no audio data")

            # Convert PCM to WAV: mono, 2-byte samples, 24000 Hz
            wav_buffer = io.BytesIO()
            with wave.open(wav_buffer, "wb") as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)
                wf.setframerate(24000)
                wf.writeframes(pcm_data)

            return wav_buffer.getvalue()

        try:
            # The SDK call is blocking; run it in a worker thread so the
            # event loop keeps serving other requests meanwhile.
            audio_bytes = await asyncio.to_thread(_synthesize_gemini)

            return Response(
                content=audio_bytes,
                media_type="audio/wav",
                headers={
                    "Content-Disposition": "inline; filename=tts_audio.wav",
                    "Cache-Control": "no-cache"
                }
            )

        except Exception as e:
            logger.error(f"Gemini TTS error: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Gemini TTS conversion failed: {str(e)}") from e

    else:
        if not elevenlabs_client:
            raise HTTPException(status_code=503, detail="TTS service not available")

        def _synthesize_elevenlabs() -> bytes:
            """Call ElevenLabs synchronously and return the joined audio bytes."""
            # Generate speech
            response = elevenlabs_client.text_to_speech.convert(
                voice_id=tts_voice_id,
                model_id="eleven_flash_v2_5",
                text=clean_text,
                voice_settings=VoiceSettings(
                    stability=0.7,  # More stability = less variability; best: 0.7–0.85
                    similarity_boost=0.9,  # Boost similarity to original voice
                    style=0.2,  # Keep subtle emotion; increase for expressive output
                    use_speaker_boost=True  # Helps preserve speaker identity better
                )
            )

            # Convert generator to bytes. Draining the generator here keeps
            # that work in the worker thread as well.
            return b"".join(response)

        try:
            # Same reasoning as the Gemini branch: keep blocking work off
            # the event loop.
            audio_bytes = await asyncio.to_thread(_synthesize_elevenlabs)

            return Response(
                content=audio_bytes,
                media_type="audio/mpeg",
                headers={
                    "Content-Disposition": "inline; filename=tts_audio.mp3",
                    "Cache-Control": "no-cache"
                }
            )

        except Exception as e:
            logger.error(f"TTS error: {str(e)}")
            raise HTTPException(status_code=500, detail=f"TTS conversion failed: {str(e)}") from e
|
| 649 |
|
| 650 |
@app.get("/health")
|
| 651 |
@limiter.limit("30/minute") # Allow frequent health checks
|