|
|
""" |
|
|
FastAPI Backend for Komentle Voice Challenge |
|
|
Handles voice analysis requests and communicates with AI server |
|
|
""" |
|
|
|
|
|
from fastapi import FastAPI, UploadFile, File, Form, HTTPException |
|
|
from fastapi.middleware.cors import CORSMiddleware |
|
|
from fastapi.staticfiles import StaticFiles |
|
|
from pydantic import BaseModel |
|
|
from typing import Optional, Dict |
|
|
from datetime import datetime |
|
|
from contextlib import asynccontextmanager, AsyncExitStack |
|
|
import os |
|
|
import time |
|
|
import base64 |
|
|
import json |
|
|
import asyncio |
|
|
import hashlib |
|
|
import io |
|
|
from pathlib import Path |
|
|
from dotenv import load_dotenv |
|
|
from sqlalchemy import create_engine, text |
|
|
import httpx |
|
|
import logging |
|
|
from mcp.client.sse import sse_client |
|
|
from mcp.client.session import ClientSession |
|
|
from gemini_adapter import call_gemini_with_tools, get_text_from_gemini_response |
|
|
from pydub import AudioSegment |
|
|
from pydub.effects import normalize |
|
|
|
|
|
|
|
|
# --- Logging ---------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load variables from a local .env file into the process environment before
# any of the settings below are read.
load_dotenv()

# --- Database --------------------------------------------------------------
DATABASE_URL = os.environ.get("DATABASE_URL")
engine = create_engine(
    DATABASE_URL,
    pool_size=10,
    max_overflow=20,
    pool_pre_ping=True,
    pool_recycle=3600,
    # NOTE(review): the "options" / statement_timeout syntax looks
    # PostgreSQL-specific - confirm against the deployed driver.
    connect_args=dict(
        connect_timeout=10,
        options="-c statement_timeout=30000",
    ),
)

# Read from the environment; not referenced by the code visible in this file.
AI_SERVER_URL = os.environ.get("AI_SERVER_URL")

# --- VoiceKit MCP connection state (rebound by lifespan / reconnect) -------
voicekit_session = None
session_stack = None
mcp_lock = None

# session_id -> number of analysis attempts made so far (in-memory only).
session_attempts = {}

# Placeholder for cached VoiceKit results and its time-to-live in seconds.
voicekit_result_cache = {}
VOICEKIT_CACHE_TTL = 3600
|
|
|
|
|
|
|
|
async def reconnect_voicekit_mcp():
    """Re-establish the VoiceKit MCP session after the connection is lost.

    Reconnection is serialised through the module-level ``mcp_lock`` so that
    concurrent callers do not each open their own session.  While holding the
    lock, the current session is first probed with ``list_tools()``; if the
    probe succeeds the function returns without reconnecting.

    Side effects:
        Rebinds the globals ``voicekit_session``, ``session_stack`` and, on
        first use, ``mcp_lock``.

    Raises:
        Exception: Whatever the SSE client or session set-up raised.
            ``voicekit_session`` is reset to ``None`` before re-raising.
    """
    global voicekit_session, session_stack, mcp_lock

    # The lock is created lazily, on the first reconnect attempt.
    if mcp_lock is None:
        mcp_lock = asyncio.Lock()

    async with mcp_lock:
        # Another caller may already have reconnected while we were waiting.
        if voicekit_session is not None:
            try:
                await voicekit_session.list_tools()
            except Exception as probe_error:
                # Catch Exception only: a bare except would also swallow
                # task cancellation and KeyboardInterrupt.
                logger.debug(f"MCP session probe failed: {probe_error}")
            else:
                logger.info("MCP session already alive, no reconnection needed")
                return

        logger.info("Reconnecting to VoiceKit MCP...")

        # Best-effort teardown of the stale connection.
        if session_stack:
            try:
                await session_stack.aclose()
            except Exception as close_error:
                logger.debug(f"Ignoring error while closing stale MCP session: {close_error}")

        session_stack = AsyncExitStack()
        try:
            voicekit_url = "https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse"
            read, write = await session_stack.enter_async_context(sse_client(voicekit_url))
            voicekit_session = await session_stack.enter_async_context(
                ClientSession(read, write)
            )
            await voicekit_session.initialize()

            tools_result = await voicekit_session.list_tools()
            logger.info(
                f"β VoiceKit MCP reconnected. Tools: {[t.name for t in tools_result.tools]}"
            )
        except Exception as e:
            logger.error(f"Failed to reconnect VoiceKit MCP: {e}")
            voicekit_session = None
            raise
|
|
|
|
|
|
|
|
def get_audio_hash(audio_bytes: bytes, reference_b64: str, answer_word: str, category: str) -> str:
    """Build the SHA-256 cache key for one analysis request.

    The key combines the SHA-256 digest of the user's audio, the first 50
    characters of the base64 reference audio, the answer word and the
    category, and hashes that combined string once more.

    Returns:
        Hex digest string usable as a dictionary key.
    """
    audio_digest = hashlib.sha256(audio_bytes).hexdigest()
    # Only a short prefix of the reference audio takes part in the key.
    reference_prefix = reference_b64[:50]
    key_material = f"{audio_digest}_{reference_prefix}_{answer_word}_{category}"
    return hashlib.sha256(key_material.encode()).hexdigest()
|
|
|
|
|
|
|
|
def compress_audio(audio_bytes: bytes, target_sample_rate: int = 16000) -> bytes:
    """Re-encode audio as mono WAV at ``target_sample_rate`` before sending it over MCP.

    The clip is decoded with pydub, down-mixed to one channel, resampled,
    normalized, stripped of silence and exported as WAV.

    Args:
        audio_bytes: Raw bytes of the uploaded audio file.
        target_sample_rate: Output sample rate in Hz (16000 by default).

    Returns:
        The re-encoded WAV bytes, or ``audio_bytes`` unchanged when any step
        of the processing fails.
    """
    try:
        started_at = time.time()
        size_before = len(audio_bytes)

        segment = AudioSegment.from_file(io.BytesIO(audio_bytes))

        # Down-mix and resample only when the input actually differs.
        if segment.channels > 1:
            segment = segment.set_channels(1)
        if segment.frame_rate != target_sample_rate:
            segment = segment.set_frame_rate(target_sample_rate)

        segment = normalize(segment)
        segment = segment.strip_silence(silence_thresh=-50, padding=100)

        sink = io.BytesIO()
        segment.export(
            sink,
            format="wav",
            parameters=["-ac", "1", "-ar", str(target_sample_rate)],
        )
        compressed_bytes = sink.getvalue()

        size_after = len(compressed_bytes)
        reduction = (1 - size_after / size_before) * 100
        elapsed_ms = (time.time() - started_at) * 1000

        logger.info(
            f"ποΈ Audio compression: {size_before/1024:.1f}KB β {size_after/1024:.1f}KB "
            f"({reduction:.1f}% reduction) in {elapsed_ms:.1f}ms"
        )
        return compressed_bytes
    except Exception as e:
        # Best effort: fall back to the untouched upload.
        logger.warning(f"Audio compression failed: {e}, using original")
        return audio_bytes
|
|
|
|
|
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Open the VoiceKit MCP connection on startup and close it on shutdown.

    A failed connection attempt is logged and leaves ``voicekit_session`` as
    ``None``; the application still starts.  The shutdown step runs in a
    ``finally`` block so the connection is released even when an exception or
    cancellation is thrown into the context manager at ``yield``.

    Side effects:
        Rebinds the globals ``voicekit_session`` and ``session_stack``.
    """
    global voicekit_session, session_stack

    logger.info("Initializing VoiceKit MCP connection...")
    session_stack = AsyncExitStack()

    try:
        voicekit_url = "https://mcp-1st-birthday-voicekit.hf.space/gradio_api/mcp/sse"
        read, write = await session_stack.enter_async_context(sse_client(voicekit_url))
        voicekit_session = await session_stack.enter_async_context(
            ClientSession(read, write)
        )
        await voicekit_session.initialize()

        tools_result = await voicekit_session.list_tools()
        logger.info(
            f"β VoiceKit MCP connected. Tools: {[t.name for t in tools_result.tools]}"
        )
    except Exception as e:
        logger.error(f"Failed to initialize VoiceKit MCP: {e}")
        voicekit_session = None

    try:
        yield
    finally:
        # session_stack is read again here because a reconnect may have
        # replaced it while the application was running.
        if session_stack:
            try:
                await session_stack.aclose()
            except Exception as close_error:
                # Do not let a teardown error abort application shutdown.
                logger.warning(f"Error while closing VoiceKit MCP connection: {close_error}")
            else:
                logger.info("β VoiceKit MCP connection closed")
|
|
|
|
|
|
|
|
# Application object; the lifespan handler manages the VoiceKit MCP session.
app = FastAPI(title="Komentle Voice API", lifespan=lifespan)

# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged by the FastAPI CORS documentation - confirm this is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Make sure each static directory exists, then serve it under "/<name>".
for _static_dir in ("images", "reference_audio"):
    os.makedirs(_static_dir, exist_ok=True)
    app.mount(f"/{_static_dir}", StaticFiles(directory=_static_dir), name=_static_dir)

# puzzle_number -> base64-encoded reference audio (filled on first load).
reference_audio_cache = {}

# Cache for generated hints; no code visible in this file reads or writes it.
hint_cache = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_reference_audio_with_fallback(reference_audio_path: str, puzzle_number: Optional[int] = None) -> Optional[str]:
    """Load reference audio as base64, trying alternative file extensions.

    The stored path is tried first.  If no regular file is found there, the
    same directory and file stem are tried with ``.wav``, ``.mp3``, ``.m4a``,
    ``.ogg`` and ``.flac`` in that order.  A successful load is memoised in
    ``reference_audio_cache`` when ``puzzle_number`` is given.

    Args:
        reference_audio_path: Path from the database (the extension may be
            wrong).  Leading ``/`` characters are stripped, so the path is
            resolved relative to the working directory.
        puzzle_number: Cache key for the loaded audio (optional).

    Returns:
        Base64-encoded audio string, or ``None`` if no file was found.
    """
    if puzzle_number is not None and puzzle_number in reference_audio_cache:
        logger.info(f"β Using cached reference audio for puzzle #{puzzle_number}")
        return reference_audio_cache[puzzle_number]

    if not reference_audio_path:
        return None

    base_path = Path(reference_audio_path.lstrip("/"))

    # Each candidate carries the log prefix used when it is the one loaded.
    candidates = [(base_path, "β Loaded reference audio")]
    candidates.extend(
        (base_path.parent / f"{base_path.stem}{ext}", "β Loaded reference audio (alternative format)")
        for ext in ['.wav', '.mp3', '.m4a', '.ogg', '.flac']
    )

    for candidate, log_prefix in candidates:
        # is_file() rather than exists(): a directory at this path would make
        # the read below raise IsADirectoryError.
        if not candidate.is_file():
            continue
        audio_b64 = base64.b64encode(candidate.read_bytes()).decode("utf-8")
        if puzzle_number is not None:
            reference_audio_cache[puzzle_number] = audio_b64
        logger.info(f"{log_prefix}: {candidate}")
        return audio_b64

    logger.warning(f"β No reference audio found for: {reference_audio_path}")
    return None
|
|
|
|
|
|
|
|
def get_hint_cache_key(attempt: int, scores: dict, category: str) -> str:
    """Build a cache key for Gemini hints from the attempt and score buckets.

    Only the two lowest numeric scores contribute to the key, each reduced to
    a coarse bucket (``low`` below 30, ``med`` below 70, ``high`` otherwise).
    Non-numeric entries of ``scores`` are ignored.

    Args:
        attempt: Attempt number.
        scores: Mapping of score name to value.
        category: Puzzle category.

    Returns:
        Key of the form ``"<category>_attempt<attempt>_<name>:<bucket>_..."``.
    """
    def bucket(score):
        """Map a score to its coarse low/med/high bucket."""
        return "low" if score < 30 else "med" if score < 70 else "high"

    ranked = [
        (name, value)
        for name, value in scores.items()
        if isinstance(value, (int, float))
    ]
    # Stable sort: ties keep the dictionary's insertion order.
    ranked.sort(key=lambda pair: pair[1])

    parts = [f"{name}:{bucket(value)}" for name, value in ranked[:2]]
    return f"{category}_attempt{attempt}_" + "_".join(parts)
|
|
|
|
|
|
|
|
|
|
|
# Response body of POST /api/analyze-voice.  It is built from the dictionary
# returned by analyze_voice_logic, which fills the score fields with values
# produced by convert_to_percentage (0-100 scale).
class AnalysisResponse(BaseModel):
    status: str  # "success" on the path that builds this model
    category: str  # puzzle category taken from the puzzles table
    answer_word: Optional[str] = None  # the puzzle's answer word
    reference_audio_path: Optional[str] = None  # reference audio path as stored in the database
    pitch: float
    rhythm: float
    energy: float
    pronunciation: float
    transcript: float
    overall: float
    advice: Optional[str] = None  # plain-text hint/advice extracted from the Gemini hints
    is_correct: bool = False  # whether the attempt counts as correct
    message: Optional[str] = None  # not populated by analyze_voice_logic on success
    user_text: Optional[str] = None  # "user_text" value reported by VoiceKit
|
|
|
|
|
|
|
|
# Generic error payload (status plus human-readable message).
# NOTE(review): no code visible in this file references this model.
class ErrorResponse(BaseModel):
    status: str
    message: str
|
|
|
|
|
|
|
|
def get_puzzle_by_date(date: str) -> Optional[Dict]:
    """Look up the puzzle scheduled for ``date``.

    Args:
        date: Date string in YYYY-MM-DD format.

    Returns:
        dict with ``puzzle_number``, ``answer_word``, ``puzzle_date`` (as a
        string), ``difficulty``, ``category`` and ``reference_audio_path``,
        or ``None`` when no row matches.  Database errors are logged and also
        reported as ``None``.
    """
    try:
        statement = text(
            """
            SELECT puzzle_number, answer_word, puzzle_date, difficulty, category, reference_audio_path
            FROM puzzles
            WHERE puzzle_date = :date
            LIMIT 1
        """
        )

        with engine.connect() as connection:
            row = connection.execute(statement, {"date": date}).fetchone()

        if not row:
            return None

        # Column order matches the SELECT list above.
        number, word, puzzle_date, difficulty, category, audio_path = row
        return {
            "puzzle_number": number,
            "answer_word": word,
            "puzzle_date": str(puzzle_date),
            "difficulty": difficulty,
            "category": category,
            "reference_audio_path": audio_path,
        }
    except Exception as e:
        logger.error(f"Database error: {e}")
        return None
|
|
|
|
|
|
|
|
def get_attempt_count(session_id: str) -> int:
    """Increment and return the attempt counter kept for ``session_id``.

    Counters live in the in-memory ``session_attempts`` dictionary; a session
    seen for the first time starts at 1.
    """
    updated_count = session_attempts.get(session_id, 0) + 1
    session_attempts[session_id] = updated_count
    return updated_count
|
|
|
|
|
|
|
|
def list_hint_files(category: str) -> list:
    """Return the file names of the hint images stored for ``category``.

    Images are looked up in ``images/hints/<category>``; a missing directory
    yields an empty list.  Names are grouped by extension in the order
    jpg, png, jpeg, gif.
    """
    hints_dir = Path("images/hints") / category
    if not hints_dir.exists():
        return []

    patterns = ("*.jpg", "*.png", "*.jpeg", "*.gif")
    return [entry.name for pattern in patterns for entry in hints_dir.glob(pattern)]
|
|
|
|
|
|
|
|
def _select_hint_strategy(attempt: int, category: str, weakest_names: list) -> tuple:
    """Return ``(hint_type, guidance, category_hint)`` for the attempt number."""
    if attempt == 1:
        return (
            "hint",
            "Give an EXTREMELY VAGUE clue. Don't mention the category yet. Just hint at the general concept.",
            "Do NOT mention the category on first attempt.",
        )
    if attempt == 2:
        return (
            "hint",
            f"Give a VAGUE clue and casually mention it's a {category}. Include an image hint if available.",
            f"Mention it's a {category} but keep the clue vague.",
        )
    if attempt <= 4:
        return (
            "hint",
            f"Give a MORE SPECIFIC clue about this {category}. Include relevant context. Use image if available.",
            f"Be clear this is a {category} and add more context.",
        )
    if attempt <= 6:
        return (
            "hint",
            f"Give a QUITE SPECIFIC hint about this {category}. Can mention era, context, or usage. Include image if helpful.",
            "Give substantial clues while still not revealing the answer.",
        )
    if attempt <= 10:
        return (
            "hint",
            f"Give VERY SPECIFIC hints. Can mention syllable count, rhymes, or first letter. This is attempt {attempt} - be helpful!",
            f"User has tried {attempt} times. Give strong hints without saying the answer.",
        )
    return (
        "advice",
        f"Attempt {attempt}! Focus on pronunciation coaching for {', '.join(weakest_names)}. Give very strong hints about what to say.",
        f"After {attempt} attempts, be very helpful while still not directly revealing the answer.",
    )


def _strip_markdown_fence(response_text: str) -> str:
    """Remove a surrounding Markdown code fence (three backticks) from the text, if present."""
    stripped = response_text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line, which may carry a language tag.
    first_newline = stripped.find("\n")
    stripped = stripped[first_newline + 1:] if first_newline != -1 else stripped[3:]
    stripped = stripped.rstrip()
    # The closing fence may sit on its own line or directly after the JSON.
    if stripped.endswith("```"):
        stripped = stripped[:-3]
    return stripped.strip()


async def generate_hints_with_gemini(
    scores: dict, attempt: int, answer_word: str, category: str
) -> dict:
    """Generate JSON hints using Gemini LLM (caching disabled to ensure unique hints).

    Args:
        scores: Score mapping from VoiceKit; ``pitch``, ``rhythm``, ``energy``
            and ``pronunciation`` are used to find the two weakest areas.
        attempt: Attempt number of the session, which selects how specific
            the hint should be.
        answer_word: The puzzle answer (embedded in the prompt).
        category: Puzzle category, also used to list available hint images.

    Returns:
        dict of the form ``{"type": ..., "answer": [{"text": ..., "path": ...}]}``.
        A fallback "advice" dict is returned when Gemini reports an error or
        anything in this function raises.
    """
    try:
        # Ignore missing or non-numeric metrics so one bad value does not
        # abort hint generation during sorting.
        metrics = {
            k: v
            for k, v in scores.items()
            if k in ["pitch", "rhythm", "energy", "pronunciation"]
            and isinstance(v, (int, float))
        }
        weakest = sorted(metrics.items(), key=lambda x: x[1])[:2]
        weakest_names = [w[0] for w in weakest]

        available_hints = list_hint_files(category)
        hint_files_str = (
            ", ".join(available_hints[:5]) if available_hints else "none available"
        )

        hint_type, guidance, category_hint = _select_hint_strategy(
            attempt, category, weakest_names
        )

        prompt = f"""You are a hint generator for "Audio Semantle" - a pronunciation puzzle game where players start blind and must figure out what word to say.

**Current State:**
- Answer word: "{answer_word}" (DO NOT reveal this directly!)
- Category: {category} (this is a {category})
- Attempt number: {attempt} (players have UNLIMITED attempts)
- Scores (0-100): Pitch={scores.get('pitch', 0)}, Rhythm={scores.get('rhythm', 0)}, Energy={scores.get('energy', 0)}, Pronunciation={scores.get('pronunciation', 0)}, Overall={scores.get('overall', 0)}
- Weakest areas: {', '.join(weakest_names)}
- Available hint images: {hint_files_str}

**Task:** {guidance}
**Category Guidance:** {category_hint}

**Hint Examples by Category:**
- If category = "meme": "This viral phrase often appears in funny internet videos..."
- If category = "movie": "This famous movie quote/title was released in..."
- If category = "song": "This classic song by [artist hint] topped the charts..."

**Return ONLY this JSON format, no other text:**
{{
  "type": "{hint_type}",
  "answer": [
    {{
      "text": "Your hint or advice text here (can mention category)",
      "path": "images/hints/{category}/filename.jpg" OR ""
    }}
  ]
}}

**Rules for Progressive Hints:**
1. Remember: Players start COMPLETELY BLIND - they don't know what to say initially
2. Hints should get progressively more helpful with each attempt
3. For "hint" type: Follow the guidance above based on attempt number
4. For "advice" type: Focus on pronunciation + give strong contextual clues
5. Keep text concise (1-2 sentences max)
6. NEVER reveal the answer directly, but after 10+ attempts be very helpful
7. Return ONLY valid JSON, no markdown, no extra text
"""

        # call_gemini_with_tools is a synchronous call; run it in the default
        # executor so it does not block the event loop for other requests.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: call_gemini_with_tools(
                model_name="gemini-2.5-flash",
                system_prompt="You are a JSON generator. Return ONLY valid JSON with no markdown formatting or extra text.",
                messages=[{"role": "user", "content": prompt}],
                tools=[],
                max_tokens=512,
            ),
        )

        response_text, error = get_text_from_gemini_response(response)
        if error:
            logger.error(f"Gemini response error: {error}")
            if weakest:
                fallback_text = f"Focus on improving {weakest_names[0]} (score: {weakest[0][1]:.0f}/100)"
            else:
                fallback_text = "Keep practicing! Focus on your pronunciation."
            return {
                "type": "advice",
                "answer": [
                    {
                        "text": fallback_text,
                        "path": "",
                    }
                ],
            }

        hints_json = json.loads(_strip_markdown_fence(response_text))
        # Callers treat the result as a mapping; anything else is a failure.
        if not isinstance(hints_json, dict):
            raise ValueError("Gemini returned JSON that is not an object")

        logger.info(f"β Generated fresh hint for attempt {attempt}, category {category}")

        return hints_json

    except Exception as e:
        logger.error(f"Hint generation error: {e}")

        return {
            "type": "advice",
            "answer": [
                {
                    "text": "Keep practicing! Focus on your pronunciation.",
                    "path": "",
                }
            ],
        }
|
|
|
|
|
|
|
|
def extract_advice_text(hints_json: dict) -> str:
    """Extract plain text from hints JSON for the advice field.

    Joins the ``"text"`` value of every entry in ``hints_json["answer"]`` with
    single spaces.  A missing ``"answer"`` key yields an empty string.

    Returns:
        The joined text, or ``"Keep practicing!"`` when ``hints_json`` does
        not have the expected shape (not a mapping, entries without a
        ``"text"`` key, non-string text, ...).
    """
    try:
        return " ".join([item["text"] for item in hints_json.get("answer", [])])
    except (AttributeError, KeyError, TypeError):
        # Malformed hint payloads only; unlike a bare except this lets
        # KeyboardInterrupt and SystemExit propagate.
        return "Keep practicing!"
|
|
|
|
|
|
|
|
async def _call_voicekit_with_retry(arguments: dict, max_retries: int = 3, timeout_seconds: int = 20):
    """Call the VoiceKit similarity tool, retrying on timeouts and errors.

    A timeout or error is retried after an exponential back-off
    (0.5s, 1s, ...).  An error that mentions ``ClosedResourceError`` triggers
    a reconnect first; after a successful reconnect the call is retried
    following a one-second pause.

    Raises:
        HTTPException: 504 when the final attempt times out.
        RuntimeError: when the retry budget is used up right after a reconnect.
        Exception: the last error raised by the tool call.
    """
    voicekit_start = time.time()

    for retry in range(max_retries):
        is_last_attempt = retry == max_retries - 1
        retry_delay = 0.5 * (2 ** retry)
        try:
            logger.info(f"Calling VoiceKit MCP (attempt {retry + 1}/{max_retries})...")
            # voicekit_session is read on every attempt, so a session created
            # by a reconnect is picked up automatically.
            result = await asyncio.wait_for(
                voicekit_session.call_tool(
                    "voicekit_analyze_voice_similarity",
                    arguments,
                ),
                timeout=timeout_seconds
            )
        except asyncio.TimeoutError:
            if is_last_attempt:
                logger.error(
                    f"VoiceKit call timed out after {max_retries} attempts"
                )
                raise HTTPException(status_code=504, detail="VoiceKit service timeout")
            logger.warning(
                f"VoiceKit call timed out after {timeout_seconds}s (attempt {retry + 1}/{max_retries}), retrying in {retry_delay}s"
            )
            await asyncio.sleep(retry_delay)
        except Exception as e:
            error_msg = str(e)

            if "ClosedResourceError" in error_msg or "ClosedResourceError" in str(type(e)):
                logger.warning(f"MCP connection closed, attempting to reconnect...")
                try:
                    await reconnect_voicekit_mcp()
                except Exception as reconnect_error:
                    logger.error(f"MCP reconnection failed: {reconnect_error}")
                else:
                    logger.info("MCP reconnected, retrying request...")
                    await asyncio.sleep(1)
                    continue

            if is_last_attempt:
                logger.error(
                    f"VoiceKit call failed after {max_retries} attempts: {e}"
                )
                raise
            logger.warning(
                f"VoiceKit call failed (attempt {retry + 1}/{max_retries}): {e}, retrying in {retry_delay}s"
            )
            await asyncio.sleep(retry_delay)
        else:
            # Measured from before the first attempt, so retries are included.
            voicekit_time = (time.time() - voicekit_start) * 1000
            logger.info(f"β VoiceKit MCP call successful")
            logger.info(f"β±οΈ VoiceKit MCP call: {voicekit_time:.1f}ms")
            logger.info(f"β Fresh VoiceKit analysis completed")
            return result

    # Reached only when the last attempt ended in a successful reconnect:
    # there is no result and no retry left.
    raise RuntimeError(
        f"VoiceKit call did not complete within {max_retries} attempts"
    )


async def call_ai_server(
    audio_file: bytes,
    session_id: str,
    category: str,
    answer_word: str,
    reference_audio_path: Optional[str] = None,
    puzzle_number: Optional[int] = None,
) -> Dict:
    """
    Analyze voice using VoiceKit MCP + Gemini for hints

    Args:
        audio_file: Audio file bytes
        session_id: User session ID
        category: Puzzle category (meme, movie, song)
        answer_word: Correct answer for this puzzle
        reference_audio_path: Path to reference audio file (from DB)
        puzzle_number: Puzzle number for caching reference audio

    Returns:
        dict: On success, the keys ``pitch``, ``rhythm``, ``energy``,
        ``pronounciation`` (spelling kept because the caller reads this key),
        ``transcript`` and ``overall_score`` (VoiceKit scores divided by 100),
        plus ``advice``, ``hints``, ``is_correct`` and ``user_text``.
        On any failure, ``{"error": <message>}``; nothing is raised.
    """
    try:
        start_time = time.time()

        if not voicekit_session:
            logger.error("VoiceKit MCP not initialized")
            return {"error": "AI service not available"}

        compressed_audio = compress_audio(audio_file)

        user_b64 = base64.b64encode(compressed_audio).decode("utf-8")
        logger.info(f"β±οΈ Base64 encoding: {(time.time() - start_time)*1000:.1f}ms")

        ref_start = time.time()
        reference_b64 = load_reference_audio_with_fallback(reference_audio_path, puzzle_number)
        logger.info(f"β±οΈ Reference audio load: {(time.time() - ref_start)*1000:.1f}ms")

        if reference_b64 is None:
            # Without reference audio the user's clip is compared to itself.
            reference_b64 = user_b64
            logger.warning("β No reference audio available, using user audio")

        attempt = get_attempt_count(session_id)
        logger.info(f"Session {session_id}: Attempt {attempt}/6")

        result = await _call_voicekit_with_retry(
            {
                "user_audio_base64": user_b64,
                "reference_audio_base64": reference_b64,
                "reference_text": answer_word,
                "category": category,
            }
        )

        # The tool response is assumed to carry the scores as JSON text in
        # its first content item; any other shape ends in the except below.
        scores_text = result.content[0].text
        scores = json.loads(scores_text)

        logger.info(f"VoiceKit scores: {scores}")

        print(f"\n{'='*50}")
        print(f"[AI RESPONSE] VoiceKit MCP μλ΅ λ°μ΄ν°:")
        print(f" Raw text: {scores_text}")
        print(f" Parsed scores:")
        for key, value in scores.items():
            print(f" - {key}: {value}")
        print(f"{'='*50}\n")

        gemini_start = time.time()
        hints_json = await generate_hints_with_gemini(
            scores=scores, attempt=attempt, answer_word=answer_word, category=category
        )
        gemini_time = (time.time() - gemini_start) * 1000
        logger.info(f"β±οΈ Gemini hint generation: {gemini_time:.1f}ms")
        logger.info(f"Generated hints: {hints_json}")

        print(f"\n{'='*50}")
        print(f"[AI RESPONSE] Gemini ννΈ μλ΅ λ°μ΄ν°:")
        print(f" {json.dumps(hints_json, ensure_ascii=False, indent=2)}")
        print(f"{'='*50}\n")

        total_time = (time.time() - start_time) * 1000
        logger.info(f"β±οΈ TOTAL REQUEST TIME: {total_time:.1f}ms")

        # Scores are divided by 100 here; the caller converts them back to
        # percentages with convert_to_percentage.
        return {
            "pitch": scores.get("pitch", 0) / 100.0,
            "rhythm": scores.get("rhythm", 0) / 100.0,
            "energy": scores.get("energy", 0) / 100.0,
            "pronounciation": scores.get("pronunciation", 0) / 100.0,
            "transcript": scores.get("transcript", 0) / 100.0,
            "overall_score": scores.get("overall", 0) / 100.0,
            "advice": extract_advice_text(hints_json),
            "hints": hints_json,
            "is_correct": scores.get("overall", 0) > 85,
            "user_text": scores.get("user_text", ""),
        }

    except Exception as e:
        logger.error(f"AI analysis error: {e}")
        import traceback

        traceback.print_exc()
        return {"error": str(e)}
|
|
|
|
|
|
|
|
def convert_to_percentage(value: float) -> float:
    """Scale a 0.0-1.0 score to a percentage rounded to one decimal place.

    Args:
        value: Score in the 0.0-1.0 range.

    Returns:
        float: ``value`` multiplied by 100, rounded to one decimal.
    """
    percentage = 100 * value
    return round(percentage, 1)
|
|
|
|
|
|
|
|
def save_guess_record(
    session_id: str,
    puzzle_number: int,
    pitch: float,
    rhythm: float,
    energy: float,
    pronunciation: float,
    transcript: float,
    overall: float,
    advice: str,
    is_correct: bool,
    user_text: str = "",
) -> bool:
    """Insert one guess into the ``guess_records`` table.

    Args:
        session_id: User session UUID.
        puzzle_number: Puzzle number.
        pitch: Pitch score (0-100).
        rhythm: Rhythm score (0-100).
        energy: Energy score (0-100).
        pronunciation: Pronunciation score (0-100).
        transcript: Transcript score (0-100).
        overall: Overall score (0-100).
        advice: AI advice text.
        is_correct: Whether the answer is correct.
        user_text: STT transcription from MCP.

    Returns:
        bool: True when the row was committed; False when anything failed
        (the error is logged, not raised).
    """
    try:
        insert_statement = text(
            """
            INSERT INTO guess_records
            (session_id, puzzle_number, pitch, rhythm, energy, pronunciation,
             transcript, overall, advice, is_correct, guess_timestamp, user_text)
            VALUES
            (:session_id, :puzzle_number, :pitch, :rhythm, :energy, :pronunciation,
             :transcript, :overall, :advice, :is_correct, :guess_timestamp, :user_text)
        """
        )

        row_values = {
            "session_id": session_id,
            "puzzle_number": puzzle_number,
            "pitch": pitch,
            "rhythm": rhythm,
            "energy": energy,
            "pronunciation": pronunciation,
            "transcript": transcript,
            "overall": overall,
            "advice": advice,
            "is_correct": is_correct,
            # Wall-clock time of the guess in milliseconds since the epoch.
            "guess_timestamp": int(time.time() * 1000),
            "user_text": user_text,
        }

        with engine.connect() as connection:
            connection.execute(insert_statement, row_values)
            connection.commit()

        logger.info(
            f"Saved guess record: session={session_id}, puzzle={puzzle_number}, correct={is_correct}"
        )
        return True
    except Exception as e:
        logger.error(f"Failed to save guess record: {e}")
        return False
|
|
|
|
|
|
|
|
@app.get("/")
async def root():
    """Minimal liveness endpoint: always answers with a static payload."""
    payload = dict(status="ok", message="Komentle Voice API")
    return payload
|
|
|
|
|
|
|
|
@app.get("/health")
async def health_check():
    """Detailed health check that also probes the database.

    Runs ``SELECT 1`` against the engine.

    Returns:
        dict with ``status`` (``"ok"``, or ``"degraded"`` when the database
        probe failed), ``database`` (``"ok"`` or ``"error: <details>"``) and
        an ISO-formatted ``timestamp``.
    """
    db_status = "ok"
    try:
        with engine.connect() as connection:
            connection.execute(text("SELECT 1"))
    except Exception as e:
        logger.error(f"Health check database probe failed: {e}")
        db_status = f"error: {str(e)}"

    return {
        # The overall status must reflect a failing dependency; reporting
        # "ok" while the database is unreachable would mislead monitors.
        "status": "ok" if db_status == "ok" else "degraded",
        "database": db_status,
        "timestamp": datetime.now().isoformat(),
    }
|
|
|
|
|
|
|
|
async def analyze_voice_logic(audio_bytes: bytes, date: str, session_id: str) -> Dict:
    """Run one full voice analysis: puzzle lookup, AI scoring, persistence.

    Can be called directly or through the API endpoint.

    Args:
        audio_bytes: Audio file bytes.
        date: Date in YYYY-MM-DD format.
        session_id: User session UUID.

    Returns:
        dict: On success a dict with ``status == "success"``, the puzzle
        fields, the six percentage scores, ``advice``, ``is_correct`` and
        ``user_text``.  On failure ``{"status": "error", "message": ...}``.
    """
    logger.info(f"Received request: date={date}, session_id={session_id}")

    puzzle = get_puzzle_by_date(date)
    if not puzzle:
        return {"status": "error", "message": f"No puzzle found for date: {date}"}

    logger.info(f"Found puzzle: {puzzle['puzzle_number']} - {puzzle['category']}")

    ai_response = await call_ai_server(
        audio_bytes,
        session_id,
        puzzle["category"],
        puzzle["answer_word"],
        puzzle.get("reference_audio_path"),
        puzzle["puzzle_number"],
    )

    if "error" in ai_response:
        return {
            "status": "error",
            "message": f"AI server error: {ai_response['error']}",
        }

    # Response field -> key used by call_ai_server ("pronounciation" is the
    # spelling that function emits).
    score_sources = {
        "pitch": "pitch",
        "rhythm": "rhythm",
        "energy": "energy",
        "pronunciation": "pronounciation",
        "transcript": "transcript",
        "overall": "overall_score",
    }
    percentages = {
        field: convert_to_percentage(ai_response.get(source_key, 0.0))
        for field, source_key in score_sources.items()
    }

    advice = ai_response.get("advice", "")
    is_correct = ai_response.get("is_correct", False)
    user_text = ai_response.get("user_text", "")

    # The boolean result is ignored: a failed save does not fail the request.
    save_guess_record(
        session_id=session_id,
        puzzle_number=puzzle["puzzle_number"],
        advice=advice,
        is_correct=is_correct,
        user_text=user_text,
        **percentages,
    )

    logger.info(
        f"Analysis complete: category={puzzle['category']}, overall={percentages['overall']}, correct={is_correct}"
    )

    result = {
        "status": "success",
        "category": puzzle["category"],
        "answer_word": puzzle["answer_word"],
        "reference_audio_path": puzzle.get("reference_audio_path"),
        **percentages,
        "advice": advice,
        "is_correct": is_correct,
        "user_text": user_text,
    }

    # Console dump of the outcome; advice is truncated to 100 characters.
    print(f"\n{'='*50}")
    print(f"[SCORING RESULT] analyze_voice_logic 리ν΄κ°:")
    for field in (
        "status",
        "category",
        "pitch",
        "rhythm",
        "energy",
        "pronunciation",
        "transcript",
        "overall",
        "is_correct",
        "user_text",
    ):
        print(f" - {field}: {result[field]}")
    advice_text = result["advice"]
    advice_preview = f"{advice_text[:100]}..." if len(advice_text) > 100 else advice_text
    print(f" - advice: {advice_preview}")
    print(f"{'='*50}\n")

    return result
|
|
|
|
|
|
|
|
@app.post("/api/analyze-voice", response_model=AnalysisResponse)
async def analyze_voice(
    audio: UploadFile = File(...), date: str = Form(...), session_id: str = Form(...)
):
    """
    Analyze user voice recording (API endpoint)

    Args:
        audio: Audio file (WAV format)
        date: Date in YYYY-MM-DD format
        session_id: User session UUID

    Returns:
        AnalysisResponse: Analysis results with scores

    Raises:
        HTTPException: 400 for an empty upload, 404 when no puzzle exists for
            ``date``, 500 for any other analysis error.
    """
    audio_bytes = await audio.read()
    if not audio_bytes:
        # Reject early instead of sending an empty clip through the pipeline.
        raise HTTPException(status_code=400, detail="Uploaded audio file is empty")

    result = await analyze_voice_logic(audio_bytes, date, session_id)

    if result.get("status") == "error":
        message = result.get("message", "Unknown error")
        # A missing puzzle is a client-side "not found", matching what
        # GET /api/puzzle/{date} returns for the same condition.
        status_code = 404 if message.startswith("No puzzle found") else 500
        raise HTTPException(status_code=status_code, detail=message)

    return AnalysisResponse(**result)
|
|
|
|
|
|
|
|
@app.get("/api/puzzle/{date}")
async def get_puzzle(date: str):
    """Return the public puzzle metadata for ``date``.

    Args:
        date: Date in YYYY-MM-DD format.

    Returns:
        dict: ``puzzle_number``, ``puzzle_date``, ``difficulty`` and
        ``category``; the answer word and audio path are not included.

    Raises:
        HTTPException: 404 when no puzzle exists for the date.
    """
    puzzle = get_puzzle_by_date(date)
    if puzzle:
        public_fields = ("puzzle_number", "puzzle_date", "difficulty", "category")
        return {field: puzzle[field] for field in public_fields}

    raise HTTPException(status_code=404, detail=f"No puzzle found for date: {date}")
|
|
|
|
|
|
|
|
@app.get("/api/dashboard")
async def get_dashboard():
    """
    Get the full dashboard data (today's statistics combined with overall ones)

    Reads today's puzzle from ``puzzles``, today's row from
    ``daily_statistics`` and the first row of ``overall_statistics``.
    Missing rows and NULL columns are reported as zero (statistics) or
    ``None`` (puzzle fields).

    Returns:
        dict: Flat dashboard statistics plus today's puzzle information.

    Raises:
        HTTPException: 500 when the data cannot be retrieved.
    """
    try:
        # "Today" follows the server's local clock.
        today = datetime.now().strftime("%Y-%m-%d")

        today_answer = text(
            """
            SELECT
                answer_word,
                reference_audio_path,
                category,
                difficulty,
                puzzle_date
            FROM puzzles
            WHERE puzzle_date = :today
            LIMIT 1
        """
        )

        today_query = text(
            """
            SELECT
                puzzle_date,
                participants,
                success_rate,
                total_attempts
            FROM daily_statistics
            WHERE puzzle_date = :today
        """
        )

        overall_query = text(
            """
            SELECT
                total_participants,
                overall_success_rate,
                total_attempts,
                total_puzzles
            FROM overall_statistics
        """
        )

        with engine.connect() as connection:
            answer_row = connection.execute(today_answer, {"today": today}).fetchone()
            today_row = connection.execute(today_query, {"today": today}).fetchone()
            overall_row = connection.execute(overall_query).fetchone()

        if answer_row:
            answer_word = answer_row[0]
            reference_audio_path = answer_row[1]
            category = answer_row[2]
            difficulty = answer_row[3]
        else:
            answer_word = None
            reference_audio_path = None
            category = None
            difficulty = None

        # "or 0" guards against NULL columns: float(None) would raise and
        # turn the whole dashboard into a 500.
        if not today_row:
            today_participants = 0
            today_success_rate = 0.0
            today_attempts = 0
        else:
            today_participants = today_row[1] or 0
            today_success_rate = float(today_row[2] or 0.0)
            today_attempts = today_row[3] or 0

        if not overall_row:
            total_participants = 0
            total_success_rate = 0.0
            total_attempts = 0
            total_puzzles = 0
        else:
            total_participants = overall_row[0] or 0
            total_success_rate = float(overall_row[1] or 0.0)
            total_attempts = overall_row[2] or 0
            total_puzzles = overall_row[3] or 0

        return {
            "today_participants": today_participants,
            "today_success_rate": today_success_rate,
            "today_attempts": today_attempts,
            "total_participants": total_participants,
            "total_success_rate": total_success_rate,
            "total_attempts": total_attempts,
            "answer_word": answer_word,
            "reference_audio_path": reference_audio_path,
            "category": category,
            "difficulty": difficulty,
            "date": today,
            "total_puzzles": total_puzzles,
        }

    except Exception as e:
        logger.error(f"Failed to get dashboard: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve dashboard")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn using SERVER_HOST and
    # BACKEND_PORT from the environment.
    import uvicorn

    # Fall back to all interfaces / port 8000 when the variables are unset;
    # int(None) would otherwise raise TypeError before the server starts.
    host = os.getenv("SERVER_HOST") or "0.0.0.0"
    port = int(os.getenv("BACKEND_PORT") or "8000")
    uvicorn.run(app, host=host, port=port, log_level="info")
|
|
|