Tim Luka Horstmann committed on
Commit
964084b
·
1 Parent(s): 8d04f0d

Use gemini TTS

Browse files
Files changed (1) hide show
  1. app.py +93 -38
app.py CHANGED
@@ -18,6 +18,8 @@ import psutil # Added for RAM tracking
18
  from google import genai
19
  from google.genai import types
20
  import httpx
 
 
21
  from elevenlabs import ElevenLabs, VoiceSettings
22
  from slowapi import Limiter, _rate_limit_exceeded_handler
23
  from slowapi.util import get_remote_address
@@ -104,6 +106,11 @@ else:
104
  elevenlabs_client = None
105
  logger.info("ElevenLabs TTS disabled (no API key provided)")
106
 
 
 
 
 
 
107
  # Define FAQs
108
  faqs = [
109
  {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
@@ -549,48 +556,96 @@ async def predict(request: Request, query_request: QueryRequest):
549
  @app.post("/api/tts")
550
  @limiter.limit("5/minute") # Allow 5 TTS requests per minute per IP
551
  async def text_to_speech(request: Request, tts_request: TTSRequest):
552
- """Convert text to speech using ElevenLabs API"""
553
- if not elevenlabs_client:
554
- raise HTTPException(status_code=503, detail="TTS service not available")
555
 
556
- try:
557
- # Clean the text for TTS (remove markdown and special characters)
558
- clean_text = tts_request.text.replace("**", "").replace("*", "").replace("\n", " ").strip()
559
-
560
- if not clean_text:
561
- raise HTTPException(status_code=400, detail="No text provided for TTS")
562
-
563
- if len(clean_text) > 1000: # Limit text length to avoid long processing times
564
- clean_text = clean_text[:1000] + "..."
 
 
 
565
 
566
- # Generate speech
567
- response = elevenlabs_client.text_to_speech.convert(
568
- voice_id=tts_voice_id,
569
- model_id="eleven_flash_v2_5",
570
- text=clean_text,
571
- voice_settings=VoiceSettings(
572
- stability=0.7, # More stability = less variability; best: 0.7–0.85
573
- similarity_boost=0.9, # Boost similarity to original voice
574
- style=0.2, # Keep subtle emotion; increase for expressive output
575
- use_speaker_boost=True # Helps preserve speaker identity better
 
 
 
 
576
  )
577
- )
578
-
579
- # Convert generator to bytes
580
- audio_bytes = b"".join(response)
581
-
582
- return Response(
583
- content=audio_bytes,
584
- media_type="audio/mpeg",
585
- headers={
586
- "Content-Disposition": "inline; filename=tts_audio.mp3",
587
- "Cache-Control": "no-cache"
588
- }
589
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
590
 
591
- except Exception as e:
592
- logger.error(f"TTS error: {str(e)}")
593
- raise HTTPException(status_code=500, detail=f"TTS conversion failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
594
 
595
  @app.get("/health")
596
  @limiter.limit("30/minute") # Allow frequent health checks
 
18
  from google import genai
19
  from google.genai import types
20
  import httpx
21
+ import wave
22
+ import io
23
  from elevenlabs import ElevenLabs, VoiceSettings
24
  from slowapi import Limiter, _rate_limit_exceeded_handler
25
  from slowapi.util import get_remote_address
 
106
  elevenlabs_client = None
107
  logger.info("ElevenLabs TTS disabled (no API key provided)")
108
 
# TTS Configuration
# TTS_PROVIDER selects the backend used by the /api/tts endpoint: "gemini"
# routes to the Gemini client, every other value is handled by ElevenLabs.
_SUPPORTED_TTS_PROVIDERS = ("elevenlabs", "gemini")

# Strip as well as lower-case: a value such as "gemini " (stray whitespace
# from a .env file or a shell export) would otherwise fail the equality check
# in the endpoint and silently select ElevenLabs.
tts_provider = os.getenv("TTS_PROVIDER", "elevenlabs").strip().lower()
if tts_provider not in _SUPPORTED_TTS_PROVIDERS:
    # Behaviour is unchanged (unknown values still use ElevenLabs); the
    # warning only makes a misconfiguration visible in the logs.
    logger.warning(
        f"Unknown TTS_PROVIDER '{tts_provider}'; falling back to ElevenLabs"
    )

# Model and prebuilt voice passed to the Gemini client when TTS_PROVIDER is "gemini".
gemini_tts_model = os.getenv("GEMINI_TTS_MODEL", "gemini-2.5-flash-preview-tts")
gemini_tts_voice = os.getenv("GEMINI_TTS_VOICE", "Kore")
114
  # Define FAQs
115
  faqs = [
116
  {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
 
def _pcm_sample_rate(mime_type, default=24000):
    """Return the sample rate named by a ``rate=`` parameter of *mime_type*.

    Falls back to *default* when the parameter is missing, malformed or not
    positive. NOTE(review): Gemini is expected to report something like
    ``audio/L16;codec=pcm;rate=24000`` -- confirm against the SDK response.
    """
    for param in (mime_type or "").split(";")[1:]:
        key, _, value = param.partition("=")
        value = value.strip()
        if key.strip().lower() == "rate" and value.isdigit() and int(value) > 0:
            return int(value)
    return default


def _pcm_to_wav(pcm_data, sample_rate=24000):
    """Wrap mono, 16-bit PCM bytes in a WAV container and return the file bytes."""
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_data)
    return buffer.getvalue()


def _synthesize_with_gemini(text):
    """Generate speech for *text* with Gemini and return WAV bytes.

    This is a blocking call; run it off the event loop.

    Raises:
        ValueError: if the response carries no inline audio data.
    """
    response = gemini_client.models.generate_content(
        model=gemini_tts_model,
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=gemini_tts_voice,
                    )
                )
            ),
        ),
    )

    # Walk the response defensively instead of indexing [0] blindly, so an
    # empty or blocked response yields a clear error rather than an opaque
    # AttributeError/IndexError message or a header-only WAV file.
    for candidate in response.candidates or []:
        content = candidate.content
        for part in (content.parts or []) if content else []:
            blob = part.inline_data
            if blob is not None and blob.data:
                return _pcm_to_wav(blob.data, _pcm_sample_rate(blob.mime_type))
    raise ValueError("Gemini returned no audio data")


def _synthesize_with_elevenlabs(text):
    """Generate speech for *text* with ElevenLabs and return MP3 bytes.

    This is a blocking call; run it off the event loop.
    """
    chunks = elevenlabs_client.text_to_speech.convert(
        voice_id=tts_voice_id,
        model_id="eleven_flash_v2_5",
        text=text,
        voice_settings=VoiceSettings(
            stability=0.7,  # More stability = less variability; best: 0.7–0.85
            similarity_boost=0.9,  # Boost similarity to original voice
            style=0.2,  # Keep subtle emotion; increase for expressive output
            use_speaker_boost=True,  # Helps preserve speaker identity better
        ),
    )
    # convert() hands back an iterable of byte chunks; joining it here keeps
    # the whole download inside the worker thread.
    return b"".join(chunks)


def _audio_response(audio_bytes, media_type, filename):
    """Build the inline, non-cacheable audio response returned by the endpoint."""
    return Response(
        content=audio_bytes,
        media_type=media_type,
        headers={
            "Content-Disposition": f"inline; filename={filename}",
            "Cache-Control": "no-cache",
        },
    )


@app.post("/api/tts")
@limiter.limit("5/minute")  # Allow 5 TTS requests per minute per IP
async def text_to_speech(request: Request, tts_request: TTSRequest):
    """Convert text to speech using ElevenLabs or Gemini API.

    The provider is chosen by the module-level ``tts_provider`` setting:
    ``"gemini"`` uses Gemini and returns ``audio/wav``; anything else uses
    ElevenLabs and returns ``audio/mpeg``.

    Args:
        request: Incoming request (required by the rate limiter decorator).
        tts_request: Payload whose ``text`` field is spoken.

    Returns:
        A ``Response`` containing the generated audio.

    Raises:
        HTTPException: 400 when the cleaned text is empty, 503 when the
            selected provider's client is not configured, 500 when the
            provider call fails.
    """
    # Local import keeps this block self-contained.
    import asyncio

    # Clean the text for TTS (remove markdown emphasis and newlines).
    # Removing every "*" also removes "**", so one replace is enough.
    clean_text = tts_request.text.replace("*", "").replace("\n", " ").strip()

    if not clean_text:
        raise HTTPException(status_code=400, detail="No text provided for TTS")

    if len(clean_text) > 1000:  # Limit text length to avoid long processing times
        clean_text = clean_text[:1000] + "..."

    if tts_provider == "gemini":
        if not gemini_client:
            raise HTTPException(status_code=503, detail="Gemini TTS service not available (API key missing)")

        try:
            # The SDK call is synchronous; a worker thread keeps the event
            # loop free to serve other requests while audio is generated.
            audio_bytes = await asyncio.to_thread(_synthesize_with_gemini, clean_text)
        except Exception as e:
            logger.exception(f"Gemini TTS error: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Gemini TTS conversion failed: {str(e)}") from e

        return _audio_response(audio_bytes, "audio/wav", "tts_audio.wav")

    if not elevenlabs_client:
        raise HTTPException(status_code=503, detail="TTS service not available")

    try:
        audio_bytes = await asyncio.to_thread(_synthesize_with_elevenlabs, clean_text)
    except Exception as e:
        logger.exception(f"TTS error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"TTS conversion failed: {str(e)}") from e

    return _audio_response(audio_bytes, "audio/mpeg", "tts_audio.mp3")
649
 
650
  @app.get("/health")
651
  @limiter.limit("30/minute") # Allow frequent health checks