mgbam committed
Commit 8d35b9c · verified · 1 Parent(s): 90848ed

Upload voice_interface.py

Files changed (1)
  1. ui/voice_interface.py +168 -0
ui/voice_interface.py ADDED
@@ -0,0 +1,168 @@
"""
ElevenLabs Voice Interface - For $2K + AirPods Pro Prize

Voice-first enterprise AI interaction.
"""

import os
from typing import Optional, AsyncGenerator
import asyncio

try:
    from elevenlabs import ElevenLabs, VoiceSettings
    from elevenlabs.client import AsyncElevenLabs
    ELEVENLABS_AVAILABLE = True
except ImportError:
    ELEVENLABS_AVAILABLE = False
    print("[WARNING] ElevenLabs not installed")


class VoiceInterface:
    """
    Voice-first interface for OmniMind using ElevenLabs.

    Prize Integration: ElevenLabs Category Award ($2K + AirPods Pro)
    - Natural conversational AI
    - Streaming voice responses
    - Enterprise-grade voice quality
    """

    def __init__(self):
        self.api_key = os.getenv("ELEVENLABS_API_KEY")

        # Voice configurations for different personas. Defined before the
        # configuration check so set_voice() works even without a client.
        self.voices = {
            "professional": "ErXwobaYiN019PkySvjV",  # Antoni - professional male
            "friendly": "EXAVITQu4vr4xnSDxMaL",      # Sarah - friendly female
            "executive": "VR6AewLTigWG4xSOukaG",     # Arnold - authoritative male
        }
        self.current_voice = "professional"

        if not ELEVENLABS_AVAILABLE or not self.api_key:
            self.client = None
            print("[WARNING] ElevenLabs not configured")
            return

        self.client = AsyncElevenLabs(api_key=self.api_key)

    async def text_to_speech(
        self,
        text: str,
        voice: str = "professional",
        stream: bool = True
    ) -> AsyncGenerator[bytes, None]:
        """
        Convert text to speech with streaming support.

        Args:
            text: Text to convert
            voice: Voice persona (professional, friendly, executive)
            stream: Stream audio chunks for real-time playback

        Yields:
            Audio chunks (bytes)
        """
        if not self.client:
            # Not configured: end the generator without yielding any audio
            return

        voice_id = self.voices.get(voice, self.voices["professional"])

        if stream:
            # Streaming for real-time responses
            audio_stream = await self.client.text_to_speech.convert_as_stream(
                text=text,
                voice_id=voice_id,
                model_id="eleven_turbo_v2_5",  # Fastest model
                voice_settings=VoiceSettings(
                    stability=0.5,
                    similarity_boost=0.75,
                    style=0.5,
                    use_speaker_boost=True
                )
            )

            async for chunk in audio_stream:
                yield chunk
        else:
            # Non-streaming for complete audio
            audio = await self.client.text_to_speech.convert(
                text=text,
                voice_id=voice_id,
                model_id="eleven_turbo_v2_5",
                voice_settings=VoiceSettings(
                    stability=0.5,
                    similarity_boost=0.75,
                    style=0.5,
                    use_speaker_boost=True
                )
            )

            yield audio

    async def speech_to_text(self, audio_data: bytes) -> str:
        """
        Convert speech to text using OpenAI Whisper for transcription.

        Args:
            audio_data: Audio bytes (WAV format)

        Returns:
            Transcribed text
        """
        # Transcription is delegated to OpenAI Whisper
        from openai import AsyncOpenAI

        openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # Save audio temporarily so Whisper can read it as a file
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(audio_data)
            audio_path = f.name

        try:
            with open(audio_path, "rb") as audio_file:
                transcript = await openai_client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file
                )

            return transcript.text

        finally:
            # Cleanup the temporary file
            os.unlink(audio_path)

    async def get_available_voices(self):
        """Get list of available voices"""
        if not self.client:
            return {"status": "unavailable", "voices": []}

        voices = await self.client.voices.get_all()

        return {
            "status": "success",
            "voices": [
                {
                    "voice_id": voice.voice_id,
                    "name": voice.name,
                    "category": voice.category
                }
                for voice in voices.voices
            ]
        }

    def set_voice(self, voice_name: str) -> bool:
        """Set the current voice persona"""
        if voice_name in self.voices:
            self.current_voice = voice_name
            return True
        return False


# Global voice interface
voice = VoiceInterface()
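
Example usage (a minimal sketch, not part of the committed file): it assumes ELEVENLABS_API_KEY is set in the environment, that the ui directory is importable as a package, and writes the streamed audio to an arbitrary demo.mp3 path.

import asyncio

from ui.voice_interface import voice


async def demo():
    # text_to_speech is an async generator; collect the streamed chunks
    chunks = []
    async for chunk in voice.text_to_speech("Hello from OmniMind.", voice="friendly"):
        chunks.append(chunk)

    # Write the collected audio to disk (demo.mp3 is an arbitrary output path)
    if chunks:
        with open("demo.mp3", "wb") as f:
            f.write(b"".join(chunks))


asyncio.run(demo())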