SJLee-0525 commited on
Commit
060d8ef
·
1 Parent(s): 18ef373

[TEST] test38

Browse files
.backend.pid DELETED
@@ -1 +0,0 @@
1
- 2909266
 
 
.env copy.example ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 음성 검증 앱 환경 변수 설정
2
+ # .env 파일로 복사해서 사용하세요: cp .env.example .env
3
+
4
+ # ========== STT 설정 ==========
5
+ # STT 제공자 선택: 'openai', 'google', 'azure', 'mock'
6
+ STT_PROVIDER=mock
7
+
8
+ # ========== OpenAI Whisper ==========
9
+ # OpenAI API 키 (https://platform.openai.com/api-keys)
10
+ # OPENAI_API_KEY=sk-your-api-key-here
11
+
12
+ # ========== Google Speech-to-Text ==========
13
+ # Google Cloud 인증 JSON 파일 경로
14
+ # GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
15
+
16
+ # ========== Azure Speech Service ==========
17
+ # Azure Speech Service 키 및 리전
18
+ # AZURE_SPEECH_KEY=your-azure-key
19
+ # AZURE_SPEECH_REGION=koreacentral
20
+
21
+ # Full Database URL
22
+ DATABASE_URL=
23
+ AI_SERVER_URL=
24
+
25
+ # Google Gemini API for LLM
26
+ GOOGLE_API_KEY=
27
+
28
+ # ========== 서버 설정 ==========
29
+ # 서버 호스트 (0.0.0.0 = 모든 인터페이스)
30
+ SERVER_HOST=0.0.0.0
31
+
32
+ # 백엔드 포트 (FastAPI)
33
+ BACKEND_PORT=8000
34
+
35
+ # 프론트엔드 포트 (Gradio)
36
+ FRONTEND_PORT=7860
37
+
38
+ # Backend API URL (๊ธฐ๋ณธ๊ฐ’: http://localhost:BACKEND_PORT)
39
+ # BACKEND_API_URL=http://localhost:8002
40
+
41
+ GRADIO_SHARE=false
42
+
43
+ # ========== 개발 모드 ==========
44
+ DEBUG=false
__pycache__ copy/backend.cpython-310.pyc ADDED
Binary file (26.4 kB). View file
 
__pycache__ copy/backend.cpython-311.pyc ADDED
Binary file (49.9 kB). View file
 
__pycache__ copy/backend.cpython-313.pyc ADDED
Binary file (39.7 kB). View file
 
__pycache__ copy/gemini_adapter.cpython-310.pyc ADDED
Binary file (11.1 kB). View file
 
__pycache__ copy/gemini_adapter.cpython-311.pyc ADDED
Binary file (20.3 kB). View file
 
__pycache__ copy/gemini_adapter.cpython-313.pyc ADDED
Binary file (13.6 kB). View file
 
backend.py ADDED
@@ -0,0 +1,1168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI Backend for Komentle Voice Challenge
3
+ Handles voice analysis requests and communicates with AI server
4
+ """
5
+
6
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.staticfiles import StaticFiles
9
+ from pydantic import BaseModel
10
+ from typing import Optional, Dict
11
+ from datetime import datetime
12
+ from contextlib import asynccontextmanager, AsyncExitStack
13
+ import os
14
+ import time
15
+ import base64
16
+ import json
17
+ import asyncio
18
+ import hashlib
19
+ import io
20
+ from pathlib import Path
21
+ from dotenv import load_dotenv
22
+ from sqlalchemy import create_engine, text
23
+ import httpx
24
+ import logging
25
+ from mcp.client.sse import sse_client
26
+ from mcp.client.session import ClientSession
27
+ from gemini_adapter import call_gemini_with_tools, get_text_from_gemini_response
28
+ from pydub import AudioSegment
29
+ from pydub.effects import normalize
30
+
31
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Database connection
# NOTE(review): DATABASE_URL is handed to create_engine unchecked at import
# time - confirm every deployment sets it.
DATABASE_URL = os.getenv("DATABASE_URL")
engine = create_engine(
    DATABASE_URL,
    pool_size=10,  # Base connection pool size
    max_overflow=20,  # Maximum number of additional connections
    pool_pre_ping=True,  # Validate a connection before using it
    pool_recycle=3600,  # Recreate connections every hour
    connect_args={
        "connect_timeout": 10,  # Connection timeout: 10 seconds
        "options": "-c statement_timeout=30000"  # Query timeout: 30 seconds
    }
)

# AI Server URL (managed via environment variable) - No longer used, replaced with direct MCP integration
AI_SERVER_URL = os.getenv("AI_SERVER_URL")

# Global VoiceKit MCP session
voicekit_session = None
session_stack = None
mcp_lock = None  # Lock for MCP reconnection

# Session tracking for attempt counts
session_attempts = {}  # {session_id: attempt_count}

# Session tracking for hint/advice history (to avoid repetition)
session_hint_history = {}  # {session_id: [list of previous hints/advice]}

# VoiceKit result cache (audio_hash -> scores)
voicekit_result_cache = {}  # {hash: {"scores": dict, "timestamp": float}}
VOICEKIT_CACHE_TTL = 3600  # 1 hour TTL
69
+
70
+
71
async def reconnect_voicekit_mcp():
    """Re-establish the global VoiceKit MCP session after the connection is lost.

    The work is serialized by a lazily created module-level lock. A caller that
    acquires the lock after another caller has already reconnected probes the
    current session with ``list_tools()`` and returns early when it answers.

    Raises:
        Exception: whatever the SSE client / MCP session raised while opening
            the new connection. ``voicekit_session`` is reset to ``None``
            before the error is re-raised.
    """
    global voicekit_session, session_stack, mcp_lock

    if mcp_lock is None:
        mcp_lock = asyncio.Lock()

    async with mcp_lock:
        # Check if already reconnected by another call
        if voicekit_session is not None:
            try:
                # Test if session is alive
                await voicekit_session.list_tools()
            except Exception as probe_error:
                # ``except Exception`` (not a bare except) so that task
                # cancellation still propagates out of this coroutine.
                logger.debug(f"Existing MCP session is not usable: {probe_error}")
            else:
                logger.info("MCP session already alive, no reconnection needed")
                return

        logger.info("Reconnecting to VoiceKit MCP...")

        # Clean up old session (best effort: a failure here must not block
        # the new connection attempt, but it is worth a log line).
        if session_stack:
            try:
                await session_stack.aclose()
            except Exception as close_error:
                logger.warning(f"Ignoring error while closing old MCP session: {close_error}")

        # Create new session
        session_stack = AsyncExitStack()
        try:
            voicekit_url = "https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse"
            read, write = await session_stack.enter_async_context(sse_client(voicekit_url))
            voicekit_session = await session_stack.enter_async_context(
                ClientSession(read, write)
            )
            await voicekit_session.initialize()

            tools_result = await voicekit_session.list_tools()
            logger.info(
                f"✓ VoiceKit MCP reconnected. Tools: {[t.name for t in tools_result.tools]}"
            )
        except Exception as e:
            logger.error(f"Failed to reconnect VoiceKit MCP: {e}")
            voicekit_session = None
            raise
117
+
118
+
119
def get_audio_hash(audio_bytes: bytes, reference_b64: str, answer_word: str, category: str) -> str:
    """Return a SHA-256 hex digest identifying one analysis request.

    The key covers the user audio, the complete reference audio, the answer
    word and the category. The whole reference string is hashed: its first
    characters are mostly container header and are shared by different clips.
    Each field is length-prefixed so that moving characters between adjacent
    fields cannot produce the same key.

    Args:
        audio_bytes: Raw user audio.
        reference_b64: Base64 text of the reference audio.
        answer_word: Expected answer for the puzzle.
        category: Puzzle category.

    Returns:
        A 64-character hexadecimal digest.
    """
    digest = hashlib.sha256()
    fields = (
        audio_bytes,
        reference_b64.encode("utf-8"),
        answer_word.encode("utf-8"),
        category.encode("utf-8"),
    )
    for field in fields:
        digest.update(len(field).to_bytes(8, "big"))
        digest.update(field)
    return digest.hexdigest()
124
+
125
+
126
def compress_audio(audio_bytes: bytes, target_sample_rate: int = 16000) -> bytes:
    """Shrink an audio clip before it is sent over MCP.

    The clip is decoded with pydub, mixed down to mono, resampled to
    ``target_sample_rate``, normalized, silence-stripped and re-exported as
    WAV. Any failure falls back to returning the input unchanged.

    Args:
        audio_bytes: Original audio bytes in any format pydub can decode.
        target_sample_rate: Target sample rate in Hz (default 16000).

    Returns:
        The processed WAV bytes, or ``audio_bytes`` itself when the input is
        empty or processing fails.
    """
    if not audio_bytes:
        # Nothing to decode; also avoids dividing by a zero original size.
        return audio_bytes

    try:
        compress_start = time.time()
        original_size = len(audio_bytes)

        # Load audio using pydub
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes))

        # Convert to mono (voice doesn't need stereo)
        if audio.channels > 1:
            audio = audio.set_channels(1)

        # Downsample to the target rate
        if audio.frame_rate != target_sample_rate:
            audio = audio.set_frame_rate(target_sample_rate)

        # Normalize audio levels
        audio = normalize(audio)

        # Strip silence below -50 dB, keeping 100 ms of padding.
        # NOTE(review): pydub's strip_silence drops every sufficiently long
        # silent stretch, not only leading/trailing ones - confirm intended.
        stripped = audio.strip_silence(silence_thresh=-50, padding=100)
        if len(stripped) > 0:
            audio = stripped
        # else: the clip was entirely "silent"; keep it rather than export
        # a zero-length file.

        # Export as WAV
        output_buffer = io.BytesIO()
        audio.export(output_buffer, format="wav", parameters=["-ac", "1", "-ar", str(target_sample_rate)])
        compressed_bytes = output_buffer.getvalue()

        compressed_size = len(compressed_bytes)
        reduction = (1 - compressed_size / original_size) * 100
        compress_time = (time.time() - compress_start) * 1000

        logger.info(
            f"🗜️ Audio compression: {original_size/1024:.1f}KB → {compressed_size/1024:.1f}KB "
            f"({reduction:.1f}% reduction) in {compress_time:.1f}ms"
        )

        return compressed_bytes

    except Exception as e:
        logger.warning(f"Audio compression failed: {e}, using original")
        return audio_bytes
177
+
178
+
179
+ # Lifespan handler for MCP initialization
180
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Open the VoiceKit MCP connection on startup and close it on shutdown.

    A failed startup connection is logged and leaves ``voicekit_session`` as
    ``None``; the application still starts. The exit stack is closed in a
    ``finally`` block so the connection is released even when an exception is
    thrown into the context while the application is running.
    """
    global voicekit_session, session_stack

    # Startup: Initialize VoiceKit MCP
    logger.info("Initializing VoiceKit MCP connection...")
    session_stack = AsyncExitStack()

    try:
        voicekit_url = "https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse"
        read, write = await session_stack.enter_async_context(sse_client(voicekit_url))
        voicekit_session = await session_stack.enter_async_context(
            ClientSession(read, write)
        )
        await voicekit_session.initialize()

        # List available tools
        tools_result = await voicekit_session.list_tools()
        logger.info(
            f"✓ VoiceKit MCP connected. Tools: {[t.name for t in tools_result.tools]}"
        )

    except Exception as e:
        logger.error(f"Failed to initialize VoiceKit MCP: {e}")
        voicekit_session = None

    try:
        yield
    finally:
        # Shutdown: cleanup. session_stack is re-read here because a
        # reconnect may have replaced it while the app was running.
        if session_stack:
            await session_stack.aclose()
            logger.info("✓ VoiceKit MCP connection closed")
213
+
214
+
215
app = FastAPI(title="Komentle Voice API", lifespan=lifespan)

# CORS settings
# NOTE(review): every origin, method and header is allowed together with
# credentials - confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files for hint images
app.mount("/images", StaticFiles(directory="images"), name="images")

# Mount static files for reference audio
app.mount("/reference_audio", StaticFiles(directory="reference_audio"), name="reference_audio")


# ============================================================================
# Performance Optimization: Caches
# ============================================================================

# Cache for base64-encoded reference audio (key: puzzle_number)
reference_audio_cache = {}

# Cache for Gemini-generated hints (key: cache_key from attempt+scores)
hint_cache = {}
242
+
243
+
244
+ # ============================================================================
245
+ # Audio Format Handling: Multi-format fallback
246
+ # ============================================================================
247
+
248
def load_reference_audio_with_fallback(reference_audio_path: str, puzzle_number: Optional[int] = None) -> Optional[str]:
    """Load reference audio as base64, trying alternative file extensions.

    The exact path is tried first. If it is not a regular file, files with the
    same stem and the extensions .wav, .mp3, .m4a, .ogg, .flac (in that order)
    are tried in the same directory. When ``puzzle_number`` is given, the
    result is stored in and served from ``reference_audio_cache``.

    Args:
        reference_audio_path: Path from the database (extension may be wrong).
            Leading slashes are removed, so the path is resolved relative to
            the working directory.
        puzzle_number: Puzzle number used as the cache key (optional).

    Returns:
        Base64-encoded audio string, or None if no file was found.
    """
    # Check cache first (if puzzle_number provided)
    if puzzle_number is not None and puzzle_number in reference_audio_cache:
        logger.info(f"✓ Using cached reference audio for puzzle #{puzzle_number}")
        return reference_audio_cache[puzzle_number]

    if not reference_audio_path:
        return None

    base_path = Path(reference_audio_path.lstrip("/"))

    # Exact path first, then alternative formats (.wav first for VoiceKit).
    candidates = [(base_path, False)]
    candidates.extend(
        (base_path.parent / f"{base_path.stem}{ext}", True)
        for ext in ('.wav', '.mp3', '.m4a', '.ogg', '.flac')
    )

    for candidate, is_alternative in candidates:
        # is_file() rather than exists(): a directory at this path must fall
        # through to the next candidate instead of failing inside open().
        if not candidate.is_file():
            continue

        with open(candidate, "rb") as f:
            audio_b64 = base64.b64encode(f.read()).decode("utf-8")
        if puzzle_number is not None:
            reference_audio_cache[puzzle_number] = audio_b64

        if is_alternative:
            logger.info(f"✓ Loaded reference audio (alternative format): {candidate}")
        else:
            logger.info(f"✓ Loaded reference audio: {candidate}")
        return audio_b64

    logger.warning(f"⚠ No reference audio found for: {reference_audio_path}")
    return None
294
+
295
+
296
def get_hint_cache_key(attempt: int, scores: dict, category: str) -> str:
    """Build a cache key for Gemini hints from the attempt and score buckets.

    Only numeric entries of ``scores`` are considered. Booleans are excluded
    even though ``bool`` is a subclass of ``int``, so a flag cannot be mistaken
    for a score. The two lowest-scoring entries are bucketed into
    low (< 30), med (< 70) or high and appended to the key.

    Args:
        attempt: Attempt number.
        scores: Mapping of metric name to score; non-numeric values
            (e.g. ``user_text``) are ignored.
        category: Puzzle category.

    Returns:
        Key of the form ``"{category}_attempt{attempt}_{name}:{bucket}_..."``.
    """
    def bucket(score):
        """Bucket a score into low/med/high."""
        if score < 30:
            return "low"
        if score < 70:
            return "med"
        return "high"

    numeric_scores = {
        name: value
        for name, value in scores.items()
        if isinstance(value, (int, float)) and not isinstance(value, bool)
    }

    # sorted() is stable, so ties keep the order in which scores were given.
    weakest = sorted(numeric_scores.items(), key=lambda item: item[1])[:2]
    weakest_str = "_".join(f"{name}:{bucket(value)}" for name, value in weakest)

    return f"{category}_attempt{attempt}_{weakest_str}"
325
+
326
+
327
+ # Response models
328
# Response model for a voice analysis result: status and category, the
# individual score fields, and optional feedback text.
class AnalysisResponse(BaseModel):
    status: str
    category: str
    answer_word: Optional[str] = None  # Answer word for chatbot context
    reference_audio_path: Optional[str] = None  # Reference audio for voice cloning
    pitch: float
    rhythm: float
    energy: float
    pronunciation: float
    transcript: float
    overall: float
    advice: Optional[str] = None
    is_correct: bool = False
    message: Optional[str] = None
    user_text: Optional[str] = None  # Text the user spoke (STT result)
343
+
344
+
345
# Response model for a failure: a status string plus a message.
class ErrorResponse(BaseModel):
    status: str
    message: str
348
+
349
+
350
def get_puzzle_by_date(date: str) -> Optional[Dict]:
    """Look up the puzzle scheduled for a given date.

    Args:
        date: Date string in YYYY-MM-DD format.

    Returns:
        A dict with ``puzzle_number``, ``answer_word``, ``puzzle_date`` (as a
        string), ``difficulty``, ``category`` and ``reference_audio_path``, or
        None when no row matches or the query fails (failures are logged).
    """
    try:
        statement = text(
            """
            SELECT puzzle_number, answer_word, puzzle_date, difficulty, category, reference_audio_path
            FROM puzzles
            WHERE puzzle_date = :date
            LIMIT 1
            """
        )

        with engine.connect() as connection:
            record = connection.execute(statement, {"date": date}).fetchone()

            if not record:
                return None

            # Columns arrive in SELECT order; the date is stringified.
            puzzle = {
                "puzzle_number": record[0],
                "answer_word": record[1],
                "puzzle_date": str(record[2]),
                "difficulty": record[3],
                "category": record[4],
                "reference_audio_path": record[5],
            }
            return puzzle

    except Exception as e:
        logger.error(f"Database error: {e}")
        return None
388
+
389
+
390
def get_attempt_count(session_id: str) -> int:
    """Increment the attempt counter for ``session_id`` and return the new value.

    A session seen for the first time starts from zero, so its first call
    returns 1.
    """
    global session_attempts
    updated_count = session_attempts.get(session_id, 0) + 1
    session_attempts[session_id] = updated_count
    return updated_count
397
+
398
+
399
def list_hint_files(category: str) -> list:
    """List the hint image file names available for a category.

    Looks in ``images/hints/<category>`` for ``*.jpg``, ``*.png``, ``*.jpeg``
    and ``*.gif`` entries. Names are grouped by pattern in that order and
    sorted within each group, so the result does not depend on directory
    enumeration order. Only regular files are returned.

    Args:
        category: Puzzle category, used as the sub-directory name.

    Returns:
        File names (not paths); an empty list when the directory is missing.
    """
    hints_dir = Path("images/hints") / category
    if not hints_dir.exists():
        return []

    files = []
    for pattern in ("*.jpg", "*.png", "*.jpeg", "*.gif"):
        files.extend(sorted(f.name for f in hints_dir.glob(pattern) if f.is_file()))
    return files
409
+
410
+
411
def _fallback_hint(category: str) -> dict:
    """Return the generic hint used whenever Gemini output is unusable."""
    return {
        "type": "advice",
        "answer": [{"text": f"Keep trying! This is a famous {category}.", "path": ""}]
    }


async def generate_hints_with_gemini(
    scores: dict, attempt: int, answer_word: str, category: str, user_text: str = "",
    hint_history: list = None
) -> dict:
    """Generate a JSON hint for the current attempt with the Gemini LLM.

    Hints are never cached: every call asks Gemini for a fresh hint and passes
    the most recent hints so the model can avoid repeating itself.

    Args:
        scores: VoiceKit scores; only ``overall`` is read here.
        attempt: Attempt number. Attempts up to 10 request type ``"hint"``,
            later attempts request type ``"advice"``.
        answer_word: Secret answer (given to the model, which is told never to
            reveal it).
        category: Puzzle category.
        user_text: What the user said, according to speech-to-text.
        hint_history: Hints already given in this session; the last five are
            included in the prompt.

    Returns:
        The parsed hint dict, ``{"type": ..., "answer": [{"text", "path"}]}``.
        A generic fallback hint is returned when Gemini reports an error, its
        output is empty or is not a JSON object, or any exception occurs.
    """
    try:
        hint_type = "hint" if attempt <= 10 else "advice"

        # Build prompt for Gemini
        overall_score = scores.get('overall', 0)

        # Format hint history for prompt (avoid repetition)
        history_text = ""
        if hint_history:
            recent_hints = hint_history[-5:]  # Last 5 hints only
            history_text = "\n".join([f" - {h}" for h in recent_hints])

        prompt = f"""You generate hints for a pronunciation game. User tries to guess and say a secret phrase.

CONTEXT:
- User said: "{user_text}"
- Secret answer: "{answer_word}" (NEVER reveal!)
- Category: {category}
- Overall score: {overall_score}/100
- Attempt: {attempt}

PREVIOUS HINTS GIVEN (DO NOT REPEAT THESE - give NEW information!):
{history_text if history_text else " (none yet)"}

MANDATORY FORMAT: Always start with "You said '[what user said]' - " then your feedback.

RULES:
1. If overall >= 70: User is saying the RIGHT phrase. Give pronunciation tips.
→ "You said 'Wingardium Leviosa' - Correct! Work on your pitch - try more dramatic."

2. If overall < 70: User is saying the WRONG phrase. Analyze what they said and GUIDE them:
- Same franchise? → "You said 'Shut up Malfoy' - Right franchise! Now think of a famous SPELL..."
- Similar category? → "You said 'I'll be back' - Good movie instinct! But try a magical fantasy..."
- Unrelated? → "You said 'Hello' - That's not it. This is a famous {category}..."

3. NEVER say "focus on pronunciation" when overall < 70!

4. Be helpful based on attempt ({attempt}): 1-3 vague, 4-6 specific, 7+ very helpful.

5. NEVER repeat hints from the history above! Always give NEW, FRESH information.

Return ONLY this JSON:
{{"type": "{hint_type}", "answer": [{{"text": "You said '...' - your feedback", "path": ""}}]}}
"""

        # Call Gemini
        print(f"[GEMINI HINT] Calling Gemini for hint generation...")
        print(f"[GEMINI HINT] User said: '{user_text}', Overall: {overall_score}, Attempt: {attempt}")
        print(f"[GEMINI HINT] Hint history: {hint_history}")

        # call_gemini_with_tools is a synchronous call; run it in a worker
        # thread so the event loop keeps serving other requests meanwhile.
        response = await asyncio.to_thread(
            call_gemini_with_tools,
            model_name="gemini-2.5-flash",
            system_prompt="You are a JSON generator. Return ONLY valid JSON with no markdown formatting or extra text.",
            messages=[{"role": "user", "content": prompt}],
            tools=[],
            max_tokens=2048,  # Generous limit for hint generation with history
        )

        # Extract JSON from response
        response_text, error = get_text_from_gemini_response(response)
        print(f"[GEMINI HINT] Response text: {response_text[:200] if response_text else 'None'}...")
        print(f"[GEMINI HINT] Error: {error}")

        if error:
            logger.error(f"Gemini response error: {error}")
            print(f"[GEMINI HINT] ❌ FALLBACK triggered due to error: {error}")
            return _fallback_hint(category)

        if not response_text:
            logger.error("Gemini returned an empty response")
            return _fallback_hint(category)

        # Clean response text (remove markdown code blocks if present)
        response_text = response_text.strip()
        if response_text.startswith("```"):
            lines = response_text.split("\n")
            response_text = "\n".join(lines[1:-1]) if len(lines) > 2 else response_text
            print(f"[GEMINI HINT] Cleaned markdown, result: {response_text[:200]}...")

        # Parse JSON
        print(f"[GEMINI HINT] Parsing JSON: {response_text[:300]}...")
        hints_json = json.loads(response_text)
        print(f"[GEMINI HINT] ✓ Parsed successfully: {hints_json}")

        if not isinstance(hints_json, dict):
            # Valid JSON but not an object (e.g. a bare list or string).
            logger.error(f"Gemini returned non-object JSON: {type(hints_json).__name__}")
            return _fallback_hint(category)

        logger.info(f"✓ Generated fresh hint for attempt {attempt}, category {category}")

        return hints_json

    except Exception as e:
        logger.error(f"Hint generation error: {e}")
        print(f"[GEMINI HINT] ❌ EXCEPTION: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return _fallback_hint(category)
562
+
563
+
564
def extract_advice_text(hints_json: dict) -> str:
    """Join the ``text`` of every hint item into one plain string.

    Args:
        hints_json: Hint payload of the form ``{"answer": [{"text": ...}, ...]}``.

    Returns:
        The item texts separated by single spaces (an empty string when there
        are no items), or ``"Keep practicing!"`` when the payload does not
        have the expected shape.
    """
    try:
        return " ".join([item["text"] for item in hints_json.get("answer", [])])
    except (AttributeError, KeyError, TypeError):
        # Not a dict, "answer" not iterable, item without "text", or
        # non-string text: fall back rather than fail the request.
        return "Keep practicing!"
570
+
571
+
572
async def _call_voicekit_with_retry(arguments: dict, max_retries: int = 3, timeout_seconds: int = 20):
    """Call the VoiceKit similarity tool with timeout, backoff and reconnect.

    Returns the tool result, or None when the final attempt ended in a
    successful reconnect with no attempts left. Raises HTTPException(504)
    after repeated timeouts and re-raises the last other error.
    """
    voicekit_start = time.time()
    for retry in range(max_retries):
        is_last_try = retry == max_retries - 1
        try:
            logger.info(f"Calling VoiceKit MCP (attempt {retry + 1}/{max_retries})...")
            # voicekit_session is read on every iteration so that a session
            # replaced by reconnect_voicekit_mcp() is picked up.
            result = await asyncio.wait_for(
                voicekit_session.call_tool("voicekit_analyze_voice_similarity", arguments),
                timeout=timeout_seconds
            )
            voicekit_time = (time.time() - voicekit_start) * 1000
            logger.info(f"✓ VoiceKit MCP call successful")
            logger.info(f"⏱️ VoiceKit MCP call: {voicekit_time:.1f}ms")
            logger.info(f"✓ Fresh VoiceKit analysis completed")
            return result
        except asyncio.TimeoutError:
            if is_last_try:
                logger.error(
                    f"VoiceKit call timed out after {max_retries} attempts"
                )
                raise HTTPException(status_code=504, detail="VoiceKit service timeout")
            # Exponential backoff: 0.5s, 1s, 2s
            retry_delay = 0.5 * (2 ** retry)
            logger.warning(
                f"VoiceKit call timed out after {timeout_seconds}s (attempt {retry + 1}/{max_retries}), retrying in {retry_delay}s"
            )
            await asyncio.sleep(retry_delay)
        except Exception as e:
            error_msg = str(e)
            # Check if MCP connection is closed
            if "ClosedResourceError" in error_msg or "ClosedResourceError" in str(type(e)):
                logger.warning(f"MCP connection closed, attempting to reconnect...")
                try:
                    await reconnect_voicekit_mcp()
                    logger.info("MCP reconnected, retrying request...")
                    await asyncio.sleep(1)
                    continue  # Retry with new connection
                except Exception as reconnect_error:
                    logger.error(f"MCP reconnection failed: {reconnect_error}")

            if is_last_try:
                logger.error(
                    f"VoiceKit call failed after {max_retries} attempts: {e}"
                )
                raise
            # Exponential backoff: 0.5s, 1s, 2s
            retry_delay = 0.5 * (2 ** retry)
            logger.warning(
                f"VoiceKit call failed (attempt {retry + 1}/{max_retries}): {e}, retrying in {retry_delay}s"
            )
            await asyncio.sleep(retry_delay)
    return None


async def call_ai_server(
    audio_file: bytes,
    session_id: str,
    category: str,
    answer_word: str,
    reference_audio_path: Optional[str] = None,
    puzzle_number: Optional[int] = None,
) -> Dict:
    """
    Analyze voice using VoiceKit MCP + Gemini for hints

    VoiceKit results are never cached: the same audio may be submitted
    repeatedly for practice and is analyzed fresh each time.

    Args:
        audio_file: Audio file bytes
        session_id: User session ID
        category: Puzzle category (meme, movie, song)
        answer_word: Correct answer for this puzzle
        reference_audio_path: Path to reference audio file (from DB)
        puzzle_number: Puzzle number for caching reference audio

    Returns:
        dict: AI analysis results with pitch, rhythm, energy, pronounciation,
        transcript, overall_score (all scaled to 0.0-1.0), advice, hints,
        is_correct and user_text; or ``{"error": message}`` on failure.

    Raises:
        HTTPException: 504 when every VoiceKit attempt timed out.
    """
    try:
        start_time = time.time()

        if not voicekit_session:
            logger.error("VoiceKit MCP not initialized")
            return {"error": "AI service not available"}

        # Compress audio before processing
        compressed_audio = compress_audio(audio_file)

        # Convert audio bytes to base64
        user_b64 = base64.b64encode(compressed_audio).decode("utf-8")
        logger.info(f"⏱️ Base64 encoding: {(time.time() - start_time)*1000:.1f}ms")

        # Load reference audio (ground truth) with format fallback and caching
        ref_start = time.time()
        reference_b64 = load_reference_audio_with_fallback(reference_audio_path, puzzle_number)
        logger.info(f"⏱️ Reference audio load: {(time.time() - ref_start)*1000:.1f}ms")

        if reference_b64 is None:
            # Fallback: use user audio as reference if GT not available
            reference_b64 = user_b64
            logger.warning("⚠ No reference audio available, using user audio")

        # Track attempt count
        attempt = get_attempt_count(session_id)
        logger.info(f"Session {session_id}: Attempt {attempt}/6")

        # Call VoiceKit MCP for voice analysis with retry logic and timeout
        result = await _call_voicekit_with_retry(
            {
                "user_audio_base64": user_b64,
                "reference_audio_base64": reference_b64,
                "reference_text": answer_word,
                "category": category,
            }
        )
        if result is None:
            # The last attempt only managed to reconnect; there is no
            # analysis to parse.
            logger.error("VoiceKit analysis unavailable after reconnect")
            return {"error": "AI service not available"}

        # Parse VoiceKit response
        scores_text = result.content[0].text
        scores = json.loads(scores_text)
        # scores = {pitch, rhythm, energy, pronunciation, transcript, overall}

        logger.info(f"VoiceKit scores: {scores}")

        print(f"\n{'='*50}")
        print(f"[AI RESPONSE] VoiceKit MCP 응답 데이터:")
        print(f" Raw text: {scores_text}")
        print(f" Parsed scores:")
        for key, value in scores.items():
            print(f" - {key}: {value}")
        print(f"{'='*50}\n")

        # Get hint history for this session (to avoid repetition)
        hint_history = session_hint_history.setdefault(session_id, [])

        # Generate hints with Gemini (including user's spoken text for context-aware advice)
        gemini_start = time.time()
        hints_json = await generate_hints_with_gemini(
            scores=scores, attempt=attempt, answer_word=answer_word, category=category,
            user_text=scores.get("user_text", ""),
            hint_history=hint_history
        )
        gemini_time = (time.time() - gemini_start) * 1000
        logger.info(f"⏱️ Gemini hint generation: {gemini_time:.1f}ms")
        logger.info(f"Generated hints: {hints_json}")

        # Store this hint in history (for next time)
        new_hint = extract_advice_text(hints_json)
        if new_hint and new_hint != "Keep practicing!":
            session_hint_history[session_id].append(new_hint)
            # Keep only last 10 hints to avoid memory bloat
            if len(session_hint_history[session_id]) > 10:
                session_hint_history[session_id] = session_hint_history[session_id][-10:]

        print(f"\n{'='*50}")
        print(f"[AI RESPONSE] Gemini 힌트 응답 데이터:")
        print(f" {json.dumps(hints_json, ensure_ascii=False, indent=2)}")
        print(f"{'='*50}\n")

        # Total time
        total_time = (time.time() - start_time) * 1000
        logger.info(f"⏱️ TOTAL REQUEST TIME: {total_time:.1f}ms")

        # Format response (convert 0-100 to 0.0-1.0 as Chloe expects)
        return {
            "pitch": scores.get("pitch", 0) / 100.0,
            "rhythm": scores.get("rhythm", 0) / 100.0,
            "energy": scores.get("energy", 0) / 100.0,
            "pronounciation": scores.get("pronunciation", 0)
            / 100.0,  # Note: typo to match Chloe's expectation
            "transcript": scores.get("transcript", 0) / 100.0,
            "overall_score": scores.get("overall", 0) / 100.0,
            "advice": new_hint,
            "hints": hints_json,
            "is_correct": scores.get("overall", 0) > 85,
            "user_text": scores.get("user_text", ""),  # STT result
        }

    except HTTPException:
        # Deliberate HTTP errors (the 504 timeout) must reach FastAPI instead
        # of being flattened into an {"error": ...} dict below.
        raise
    except Exception as e:
        logger.error(f"AI analysis error: {e}")
        import traceback

        traceback.print_exc()
        return {"error": str(e)}
770
+
771
+
772
def convert_to_percentage(value: float) -> float:
    """
    Scale a normalized AI score to a percentage.

    Args:
        value: Score in the 0.0-1.0 range

    Returns:
        float: The score multiplied by 100 and rounded to one decimal place
    """
    scaled = value * 100
    return round(scaled, 1)
783
+
784
+
785
def save_guess_record(
    session_id: str,
    puzzle_number: int,
    pitch: float,
    rhythm: float,
    energy: float,
    pronunciation: float,
    transcript: float,
    overall: float,
    advice: str,
    is_correct: bool,
    user_text: str = "",
) -> bool:
    """
    Insert one guess attempt into the guess_records table.

    This is best-effort: any exception is logged and reported through the
    return value instead of being raised to the caller.

    Args:
        session_id: User session UUID
        puzzle_number: Puzzle number
        pitch: Pitch score (0-100)
        rhythm: Rhythm score (0-100)
        energy: Energy score (0-100)
        pronunciation: Pronunciation score (0-100)
        transcript: Transcript score (0-100)
        overall: Overall score (0-100)
        advice: AI advice
        is_correct: Whether answer is correct
        user_text: STT transcription from MCP

    Returns:
        bool: True if saved successfully, False otherwise
    """
    try:
        # Bind parameters for the INSERT; the timestamp is the current wall
        # clock time expressed in whole milliseconds.
        row_values = {
            "session_id": session_id,
            "puzzle_number": puzzle_number,
            "pitch": pitch,
            "rhythm": rhythm,
            "energy": energy,
            "pronunciation": pronunciation,
            "transcript": transcript,
            "overall": overall,
            "advice": advice,
            "is_correct": is_correct,
            "guess_timestamp": int(time.time() * 1000),
            "user_text": user_text,
        }

        insert_stmt = text(
            """
            INSERT INTO guess_records
            (session_id, puzzle_number, pitch, rhythm, energy, pronunciation,
            transcript, overall, advice, is_correct, guess_timestamp, user_text)
            VALUES
            (:session_id, :puzzle_number, :pitch, :rhythm, :energy, :pronunciation,
            :transcript, :overall, :advice, :is_correct, :guess_timestamp, :user_text)
            """
        )

        with engine.connect() as conn:
            conn.execute(insert_stmt, row_values)
            conn.commit()

        logger.info(
            f"Saved guess record: session={session_id}, puzzle={puzzle_number}, correct={is_correct}"
        )
        return True

    except Exception as e:
        logger.error(f"Failed to save guess record: {e}")
        return False
860
+
861
+
862
@app.get("/")
async def root():
    """Minimal health check: returns a static status payload."""
    payload = {"status": "ok", "message": "Komentle Voice API"}
    return payload
866
+
867
+
868
@app.get("/health")
async def health_check():
    """
    Detailed health check.

    Runs a trivial ``SELECT 1`` against the database and reports the outcome
    in the ``database`` field. The top-level ``status`` is always "ok", even
    when the database probe fails.

    Returns:
        dict: ``status``, ``database`` ("ok" or "error: <details>") and the
        current local time as an ISO-8601 ``timestamp``
    """
    try:
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
    except Exception as e:
        db_status = f"error: {str(e)}"
    else:
        db_status = "ok"

    report = {
        "status": "ok",
        "database": db_status,
        "timestamp": datetime.now().isoformat(),
    }
    return report
883
+
884
+
885
async def analyze_voice_logic(audio_bytes: bytes, date: str, session_id: str) -> Dict:
    """
    Core voice-analysis pipeline (callable directly or through the API).

    Looks up the puzzle for ``date``, sends the audio to the AI server,
    converts the returned scores to percentages, stores a guess record and
    returns the combined result.

    Args:
        audio_bytes: Audio file bytes
        date: Date in YYYY-MM-DD format
        session_id: User session UUID

    Returns:
        dict: On success, ``status="success"`` plus puzzle info, the six
        scores (0-100), advice, correctness flag and recognized text.
        On failure, ``status="error"`` with a ``message``.
    """
    logger.info(f"Received request: date={date}, session_id={session_id}")

    # 1. Resolve the puzzle for the requested date.
    puzzle = get_puzzle_by_date(date)
    if not puzzle:
        return {"status": "error", "message": f"No puzzle found for date: {date}"}

    puzzle_number = puzzle["puzzle_number"]
    category = puzzle["category"]
    logger.info(f"Found puzzle: {puzzle_number} - {category}")

    answer_word = puzzle["answer_word"]
    reference_audio_path = puzzle.get("reference_audio_path")

    # 2. Ask the AI server to score the recording against this puzzle.
    ai_response = await call_ai_server(
        audio_bytes,
        session_id,
        category,
        answer_word,
        reference_audio_path,
        puzzle_number,
    )

    if "error" in ai_response:
        return {
            "status": "error",
            "message": f"AI server error: {ai_response['error']}",
        }

    # 3. The AI server reports scores in the 0.0-1.0 range; the frontend
    #    needs 0-100. Maps our field name -> AI response key (note the AI
    #    server's misspelled "pronounciation" key).
    response_keys = {
        "pitch": "pitch",
        "rhythm": "rhythm",
        "energy": "energy",
        "pronunciation": "pronounciation",
        "transcript": "transcript",
        "overall": "overall_score",
    }
    scores = {
        field: convert_to_percentage(ai_response.get(key, 0.0))
        for field, key in response_keys.items()
    }

    advice = ai_response.get("advice", "")
    is_correct = ai_response.get("is_correct", False)  # correctness as judged by the AI
    user_text = ai_response.get("user_text", "")  # what the user said (STT result)

    # 4. Persist the attempt (the helper logs its own failures).
    save_guess_record(
        session_id=session_id,
        puzzle_number=puzzle_number,
        advice=advice,
        is_correct=is_correct,
        user_text=user_text,
        **scores,
    )

    logger.info(
        f"Analysis complete: category={category}, overall={scores['overall']}, correct={is_correct}"
    )

    result = {
        "status": "success",
        "category": category,
        "answer_word": answer_word,  # Add answer for chatbot context
        "reference_audio_path": reference_audio_path,  # For TTS voice cloning
        **scores,
        "advice": advice,
        "is_correct": is_correct,
        "user_text": user_text,
    }

    # Debug dump of the returned payload; advice is truncated to 100 chars.
    banner = "=" * 50
    print(f"\n{banner}")
    print(f"[SCORING RESULT] analyze_voice_logic ๋ฆฌํ„ด๊ฐ’:")
    for field in (
        "status",
        "category",
        "pitch",
        "rhythm",
        "energy",
        "pronunciation",
        "transcript",
        "overall",
        "is_correct",
        "user_text",
    ):
        print(f" - {field}: {result[field]}")
    advice_text = result["advice"]
    if len(advice_text) > 100:
        print(f" - advice: {advice_text[:100]}...")
    else:
        print(f" - advice: {advice_text}")
    print(f"{banner}\n")

    return result
990
+
991
+
992
@app.post("/api/analyze-voice", response_model=AnalysisResponse)
async def analyze_voice(
    audio: UploadFile = File(...), date: str = Form(...), session_id: str = Form(...)
):
    """
    Analyze user voice recording (API endpoint)

    Args:
        audio: Audio file (WAV format)
        date: Date in YYYY-MM-DD format
        session_id: User session UUID

    Returns:
        AnalysisResponse: Analysis results with scores

    Raises:
        HTTPException: 404 when no puzzle exists for ``date``; 500 for any
            other failure reported by the analysis logic.
    """
    # Read audio file
    audio_bytes = await audio.read()

    # Call core logic
    result = await analyze_voice_logic(audio_bytes, date, session_id)

    # Handle errors
    if result.get("status") == "error":
        message = result.get("message", "Unknown error")
        # A missing puzzle is a "not found" condition, not a server fault.
        # Report it as 404 so this endpoint agrees with /api/puzzle/{date};
        # the core logic signals it with a "No puzzle found" message prefix.
        not_found = str(message).startswith("No puzzle found")
        raise HTTPException(status_code=404 if not_found else 500, detail=message)

    return AnalysisResponse(**result)
1020
+
1021
+
1022
@app.get("/api/puzzle/{date}")
async def get_puzzle(date: str):
    """
    Return the public puzzle metadata for a given date.

    Args:
        date: Date in YYYY-MM-DD format

    Returns:
        dict: puzzle_number, puzzle_date, difficulty and category

    Raises:
        HTTPException: 404 when no puzzle exists for the date
    """
    puzzle = get_puzzle_by_date(date)
    if puzzle:
        # Only whitelisted fields are copied; answer_word is deliberately
        # not exposed to the frontend.
        public_fields = ("puzzle_number", "puzzle_date", "difficulty", "category")
        return {field: puzzle[field] for field in public_fields}

    raise HTTPException(status_code=404, detail=f"No puzzle found for date: {date}")
1044
+
1045
+
1046
@app.get("/api/dashboard")
async def get_dashboard():
    """
    Fetch the full dashboard data (today's stats combined with overall stats).

    Reads today's puzzle from ``puzzles``, today's row from
    ``daily_statistics`` and the single row from ``overall_statistics``.
    Missing rows fall back to zero / None values, and NULL success rates are
    reported as 0.0 instead of failing the whole request.

    Returns:
        dict: Flat dashboard statistics with 6 key metrics, plus today's
        puzzle info, the date and the total puzzle count

    Raises:
        HTTPException: 500 when the statistics cannot be retrieved
    """

    def _rate(value) -> float:
        """Coerce a nullable numeric column to float, treating NULL as 0.0."""
        return 0.0 if value is None else float(value)

    try:
        today = datetime.now().strftime("%Y-%m-%d")
        today_answer = text(
            """
            SELECT
                answer_word,
                reference_audio_path,
                category,
                difficulty,
                puzzle_date
            FROM puzzles
            WHERE puzzle_date = :today
            LIMIT 1
            """
        )

        # Today's statistics
        today_query = text(
            """
            SELECT
                puzzle_date,
                participants,
                success_rate,
                total_attempts
            FROM daily_statistics
            WHERE puzzle_date = :today
            """
        )

        # Overall statistics
        overall_query = text(
            """
            SELECT
                total_participants,
                overall_success_rate,
                total_attempts,
                total_puzzles
            FROM overall_statistics
            """
        )

        with engine.connect() as connection:
            # Today's puzzle info
            answer_row = connection.execute(today_answer, {"today": today}).fetchone()

            if answer_row:
                answer_word = answer_row[0]
                reference_audio_path = answer_row[1]
                category = answer_row[2]
                difficulty = answer_row[3]
            else:
                answer_word = None
                reference_audio_path = None
                category = None
                difficulty = None

            # Today's statistics (zeros when nobody has played yet)
            today_row = connection.execute(today_query, {"today": today}).fetchone()

            if not today_row:
                today_participants = 0
                today_success_rate = 0.0
                today_attempts = 0
            else:
                today_participants = today_row[1]
                # success_rate may be NULL; float(None) would raise.
                today_success_rate = _rate(today_row[2])
                today_attempts = today_row[3]

            # Overall statistics
            overall_row = connection.execute(overall_query).fetchone()

            if not overall_row:
                total_participants = 0
                total_success_rate = 0.0
                total_attempts = 0
                total_puzzles = 0
            else:
                total_participants = overall_row[0]
                # overall_success_rate may be NULL as well.
                total_success_rate = _rate(overall_row[1])
                total_attempts = overall_row[2]
                total_puzzles = overall_row[3]

        return {
            # Today's statistics
            "today_participants": today_participants,  # 1. participants today
            "today_success_rate": today_success_rate,  # 2. success rate today
            "today_attempts": today_attempts,  # 5. attempts today
            # Overall statistics
            "total_participants": total_participants,  # 3. participants overall
            "total_success_rate": total_success_rate,  # 4. success rate overall
            "total_attempts": total_attempts,  # 6. attempts overall
            # Today's puzzle info
            "answer_word": answer_word,
            "reference_audio_path": reference_audio_path,
            "category": category,
            "difficulty": difficulty,
            # Additional info
            "date": today,
            "total_puzzles": total_puzzles,
        }

    except Exception as e:
        logger.error(f"Failed to get dashboard: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve dashboard")
1161
+
1162
+
1163
if __name__ == "__main__":
    import uvicorn

    # Fall back to the values from the project's example .env when the
    # variables are unset or empty; int(None) would otherwise raise a
    # TypeError before the server even starts.
    host = os.getenv("SERVER_HOST") or "0.0.0.0"
    port = int(os.getenv("BACKEND_PORT") or "8000")
    uvicorn.run(app, host=host, port=port, log_level="info")
client/frontend/components/floating_chatbot.py CHANGED
@@ -59,7 +59,7 @@ from gemini_adapter import chat_with_gemini, chat_with_gemini_and_tools
59
  # [๋ฐฉ์‹ 2] Gemini ๊ธฐ๋ฐ˜ AI ์ฑ—๋ด‡ (game_state ์ปจํ…์ŠคํŠธ ํฌํ•จ)
60
  # ============================================================
61
 
62
- def build_system_prompt_from_game_state(game_state: Optional[Dict]) -> str:
63
  """
64
  game_state์—์„œ ํžŒํŠธ ํžˆ์Šคํ† ๋ฆฌ๋ฅผ ์ถ”์ถœํ•˜์—ฌ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
65
 
@@ -229,7 +229,8 @@ Greet them warmly and help them understand the game:
229
  context_parts.append(f"- You can reference the category and give contextual clues")
230
 
231
  # Add audio hint capability info (Phase 2 + Tool Calling)
232
- if is_elevenlabs_configured():
 
233
  context_parts.append(f"\n### AUDIO HINT TOOL (Use ONLY when explicitly requested)")
234
  context_parts.append(f"- You have a tool called `generate_audio_hint` that generates TTS audio")
235
  context_parts.append(f"- ONLY call this tool when the user EXPLICITLY asks for audio hints:")
@@ -356,8 +357,21 @@ def call_ai_backend(message: str, user_id: str, history: List[Dict], game_state:
356
  answer_word = ai_analysis.get('answerWord', '')
357
  reference_audio_path = ai_analysis.get('referenceAudioPath', '')
358
 
359
- # game_state์—์„œ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
360
- system_prompt = build_system_prompt_from_game_state(game_state)
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
  print(f"[CHATBOT] Calling Gemini with context:")
363
  print(f" - User ID: {user_id}")
@@ -365,23 +379,15 @@ def call_ai_backend(message: str, user_id: str, history: List[Dict], game_state:
365
  print(f" - History length: {len(history)}")
366
  print(f" - Game state attempts: {attempt_count}")
367
  print(f" - Answer word: {answer_word}")
368
- # Debug ElevenLabs status
369
- from utils.elevenlabs_tts import get_api_key, ELEVENLABS_AVAILABLE
370
- api_key = get_api_key()
371
  print(f" - ElevenLabs AVAILABLE: {ELEVENLABS_AVAILABLE}")
372
  print(f" - ElevenLabs API key set: {bool(api_key)}")
373
- print(f" - ElevenLabs configured: {is_elevenlabs_configured()}")
374
 
375
  # Define audio hint tool if ElevenLabs is configured
376
  tools = []
377
- elevenlabs_ready = is_elevenlabs_configured()
378
- print(f"[CHATBOT] ElevenLabs ready: {elevenlabs_ready}, Answer word: '{answer_word}'")
379
-
380
- # Only enable audio tool if user EXPLICITLY asks for audio in THIS message
381
- audio_keywords = ['audio', 'play', 'sound', 'hear', 'listen', 'tts', 'pronounce', '๋“ค๋ ค', '๋ฐœ์Œ']
382
- user_wants_audio = any(kw in message.lower() for kw in audio_keywords)
383
 
384
- if elevenlabs_ready and answer_word and user_wants_audio:
385
  # Get word count for tool description
386
  word_count = len(answer_word.split())
387
  tools = [{
 
59
  # [๋ฐฉ์‹ 2] Gemini ๊ธฐ๋ฐ˜ AI ์ฑ—๋ด‡ (game_state ์ปจํ…์ŠคํŠธ ํฌํ•จ)
60
  # ============================================================
61
 
62
+ def build_system_prompt_from_game_state(game_state: Optional[Dict], include_audio_tool: bool = False) -> str:
63
  """
64
  game_state์—์„œ ํžŒํŠธ ํžˆ์Šคํ† ๋ฆฌ๋ฅผ ์ถ”์ถœํ•˜์—ฌ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
65
 
 
229
  context_parts.append(f"- You can reference the category and give contextual clues")
230
 
231
  # Add audio hint capability info (Phase 2 + Tool Calling)
232
+ # Only mention the tool when it will actually be available to avoid UNEXPECTED_TOOL_CALL errors
233
+ if include_audio_tool:
234
  context_parts.append(f"\n### AUDIO HINT TOOL (Use ONLY when explicitly requested)")
235
  context_parts.append(f"- You have a tool called `generate_audio_hint` that generates TTS audio")
236
  context_parts.append(f"- ONLY call this tool when the user EXPLICITLY asks for audio hints:")
 
357
  answer_word = ai_analysis.get('answerWord', '')
358
  reference_audio_path = ai_analysis.get('referenceAudioPath', '')
359
 
360
+ # Debug ElevenLabs status
361
+ from utils.elevenlabs_tts import get_api_key, ELEVENLABS_AVAILABLE
362
+ api_key = get_api_key()
363
+ elevenlabs_ready = is_elevenlabs_configured()
364
+
365
+ # LLM-driven approach: Enable audio tool when technically available
366
+ # Let Gemini decide when to USE it based on user intent (system prompt guides this)
367
+ # This is better than keyword-based gating because:
368
+ # - Handles any phrasing naturally ("speak it", "let me hear", etc.)
369
+ # - Understands context (won't trigger on "I lost my voice")
370
+ # - Works across languages without maintaining keyword lists
371
+ tools_will_be_enabled = elevenlabs_ready and bool(answer_word)
372
+
373
+ # game_state์—์„œ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ (with audio tool info only if tools will be enabled)
374
+ system_prompt = build_system_prompt_from_game_state(game_state, include_audio_tool=tools_will_be_enabled)
375
 
376
  print(f"[CHATBOT] Calling Gemini with context:")
377
  print(f" - User ID: {user_id}")
 
379
  print(f" - History length: {len(history)}")
380
  print(f" - Game state attempts: {attempt_count}")
381
  print(f" - Answer word: {answer_word}")
 
 
 
382
  print(f" - ElevenLabs AVAILABLE: {ELEVENLABS_AVAILABLE}")
383
  print(f" - ElevenLabs API key set: {bool(api_key)}")
384
+ print(f" - ElevenLabs configured: {elevenlabs_ready}")
385
 
386
  # Define audio hint tool if ElevenLabs is configured
387
  tools = []
388
+ print(f"[CHATBOT] ElevenLabs ready: {elevenlabs_ready}, Answer word: '{answer_word}', Tools enabled: {tools_will_be_enabled}")
 
 
 
 
 
389
 
390
+ if tools_will_be_enabled:
391
  # Get word count for tool description
392
  word_count = len(answer_word.split())
393
  tools = [{
client/services/analysis_service.py CHANGED
@@ -103,7 +103,8 @@ async def analyze_voice(audio_bytes: bytes, date: str, session_id: str) -> Dict:
103
  logger.info(f"Generated hints: {hints_json}")
104
 
105
  advice = extract_advice_text(hints_json)
106
- is_correct = overall > 85
 
107
 
108
  # 6. Store this hint in history (for next time)
109
  add_hint_to_history(session_id, advice)
 
103
  logger.info(f"Generated hints: {hints_json}")
104
 
105
  advice = extract_advice_text(hints_json)
106
+ # Require non-empty user_text AND score > 85 for correct answer
107
+ is_correct = bool(user_text and user_text.strip()) and overall > 85
108
 
109
  # 6. Store this hint in history (for next time)
110
  add_hint_to_history(session_id, advice)
requirements.txt CHANGED
@@ -33,7 +33,7 @@ httpx==0.28.1
33
  httpx-sse==0.4.3
34
  huggingface_hub==1.1.5
35
  idna==3.11
36
- ipython==8.18.1
37
  ipython_pygments_lexers==1.1.1
38
  jedi==0.19.2
39
  Jinja2==3.1.6
 
33
  httpx-sse==0.4.3
34
  huggingface_hub==1.1.5
35
  idna==3.11
36
+ ipython==9.7.0
37
  ipython_pygments_lexers==1.1.1
38
  jedi==0.19.2
39
  Jinja2==3.1.6