Merge branch 'newbranchYB-newest' into Merged20250805
- evaluation/direct_llm_evaluator.py +419 -0
- evaluation/latency_evaluator.py +892 -0
- evaluation/metric1_latency_chart_generator.py +327 -0
- evaluation/metric2_extraction_chart_generator.py +216 -0
- evaluation/metric3_relevance_chart_generator.py +231 -0
- evaluation/metric4_coverage_chart_generator.py +222 -0
- evaluation/metric5_6_judge_evaluator_manual.md +303 -0
- evaluation/metric5_6_llm_judge_chart_generator.py +430 -0
- evaluation/metric5_6_llm_judge_evaluator.py +643 -0
- evaluation/metric7_8_precision_MRR.py +402 -0
- evaluation/metric7_8_precision_mrr_chart_generator.py +586 -0
- evaluation/old/coverage_evaluator.py +560 -0
- evaluation/{evaluation_instruction.md → old/evaluation_instruction.md} +455 -36
- evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md} +0 -0
- evaluation/old/extraction_evaluator.py +379 -0
- evaluation/old/relevance_evaluator.py +447 -0
- evaluation/pre_user_query_evaluate.txt +5 -0
- evaluation/single_test_query.txt +1 -0
- evaluation/user_query.txt +7 -7
- src/generation.py +6 -6
- src/llm_clients.py +241 -8
- src/medical_conditions.py +8 -0
- src/user_prompt.py +6 -4
evaluation/direct_llm_evaluator.py
ADDED
@@ -0,0 +1,419 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Direct LLM Evaluator (Med42-70B Only)
========================================================

Tests Med42-70B directly without RAG pipeline.
Only applicable metrics: 1 (Latency), 5 (Actionability), 6 (Evidence Quality)

Metrics 2-4 (Extraction, Relevance, Coverage) are not applicable for direct LLM.

Author: YanBo Chen
Date: 2025-08-04
"""

import time
import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import re

# Add project path
current_dir = Path(__file__).parent
project_root = current_dir.parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Import LLM client only (no retrieval system needed)
try:
    from llm_clients import llm_Med42_70BClient
except ImportError as e:
    print(f"❌ Import failed: {e}")
    print("Please ensure running from project root directory")
    sys.exit(1)


class DirectLLMEvaluator:
    """Direct LLM evaluation without RAG pipeline"""

    def __init__(self):
        """Initialize direct LLM client only"""
        print("🔧 Initializing Direct LLM Evaluator...")

        # Initialize only LLM client (no retrieval, no user_prompt processing)
        self.llm_client = llm_Med42_70BClient()

        # Results accumulation
        self.direct_results = []
        self.medical_outputs = []

        print("✅ Direct LLM Evaluator initialization complete")

    def evaluate_direct_llm_query(self, query: str, category: str = "unknown") -> Dict[str, Any]:
        """
        Direct LLM evaluation for single query

        Only tests direct LLM response without RAG pipeline
        Applicable metrics: 1 (Latency), 5-6 (via medical output)

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)
        """
        print(f"🔍 Direct LLM evaluation: {query[:50]}...")
        print(f"📋 Category: {category}")

        overall_start = time.time()

        try:
            # Direct LLM call without any RAG processing
            llm_start = time.time()

            # Create direct medical consultation prompt
            direct_prompt = f"""
You are a medical expert providing clinical guidance.

Patient Query: {query}

Please provide comprehensive medical advice including:
1. Differential diagnosis (if applicable)
2. Immediate assessment steps
3. Treatment recommendations
4. Clinical considerations

Provide evidence-based, actionable medical guidance.
"""

            # Direct LLM generation (same parameters as RAG system for fair comparison)
            response = self.llm_client.analyze_medical_query(
                query=direct_prompt,
                max_tokens=1600,  # Same as RAG system primary setting
                timeout=60.0  # Increased timeout for stable evaluation
            )
            # Extract medical advice from response (Med42 client returns dict with 'raw_response')
            if isinstance(response, dict):
                medical_advice = response.get('raw_response', '') or response.get('content', '')
            else:
                medical_advice = str(response)

            llm_time = time.time() - llm_start
            total_time = time.time() - overall_start

            # Check if response is valid (not empty) - focus on content, not timeout
            if not medical_advice or len(medical_advice.strip()) == 0:
                print(f"❌ Direct LLM returned empty response after {total_time:.2f}s")
                raise ValueError("Empty response from LLM - no content generated")

            # Create result
            result = {
                "query": query,
                "category": category,

                # Metric 1: Total Latency (direct LLM call time)
                "latency_metrics": {
                    "total_latency": total_time,
                    "llm_generation_time": llm_time,
                    "meets_target": total_time <= 60.0
                },

                # Metrics 2-4: Not applicable for direct LLM
                "extraction_metrics": {
                    "not_applicable": True,
                    "reason": "No extraction pipeline in direct LLM"
                },
                "relevance_metrics": {
                    "not_applicable": True,
                    "reason": "No retrieval pipeline in direct LLM"
                },
                "coverage_metrics": {
                    "not_applicable": True,
                    "reason": "No retrieval content to cover"
                },

                # Medical advice for metrics 5-6 evaluation
                "medical_advice": medical_advice,
                "advice_length": len(medical_advice),

                "overall_success": True,
                "model_type": "Med42-70B_direct",
                "timestamp": datetime.now().isoformat()
            }

            # Store result
            self.direct_results.append(result)

            # Store medical output for LLM judge evaluation
            medical_output = {
                "query": query,
                "category": category,
                "medical_advice": medical_advice,
                "query_id": f"{category}_query_direct",
                "model_type": "Med42-70B_direct",
                "processing_time": total_time,
                "timestamp": datetime.now().isoformat()
            }
            self.medical_outputs.append(medical_output)

            print(f"✅ Direct LLM completed in {total_time:.2f}s")
            print(f"📝 Generated advice: {len(medical_advice)} characters")

            return result

        except Exception as e:
            total_time = time.time() - overall_start
            print(f"❌ Direct LLM evaluation failed after {total_time:.2f}s: {e}")

            error_result = {
                "query": query,
                "category": category,
                "latency_metrics": {
                    "total_latency": total_time,
                    "meets_target": False
                },
                "overall_success": False,
                "error": str(e),
                "model_type": "Med42-70B_direct",
                "timestamp": datetime.now().isoformat()
            }

            self.direct_results.append(error_result)

            # Do NOT add failed queries to medical_outputs for judge evaluation
            # Only successful queries with valid medical advice should be evaluated

            return error_result

    def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
        """Parse queries from file with category labels"""
        print(f"📁 Reading queries from file: {filepath}")

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            queries_by_category = {
                "diagnosis": [],
                "treatment": [],
                "mixed": []
            }

            lines = content.strip().split('\n')

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
                if match:
                    category_raw = match.group(1).lower()
                    query_text = match.group(2).strip()

                    if category_raw in ['mixed/complicated', 'mixed']:
                        category = 'mixed'
                    else:
                        category = category_raw

                    if category in queries_by_category and len(query_text) > 15:
                        queries_by_category[category].append({
                            "text": query_text,
                            "category": category
                        })

            print(f"📋 Parsed queries by category:")
            for category, category_queries in queries_by_category.items():
                print(f" {category.capitalize()}: {len(category_queries)} queries")

            return queries_by_category

        except Exception as e:
            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

    def calculate_direct_llm_statistics(self) -> Dict[str, Any]:
        """Calculate statistics for direct LLM evaluation"""
        successful_results = [r for r in self.direct_results if r.get('overall_success')]

        if successful_results:
            latencies = [r['latency_metrics']['total_latency'] for r in successful_results]

            # Category-wise statistics
            category_stats = {}
            results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}

            for result in successful_results:
                category = result.get('category', 'unknown')
                if category in results_by_category:
                    results_by_category[category].append(result)

            for category, results in results_by_category.items():
                if results:
                    cat_latencies = [r['latency_metrics']['total_latency'] for r in results]
                    category_stats[category] = {
                        "average_latency": sum(cat_latencies) / len(cat_latencies),
                        "query_count": len(cat_latencies),
                        "target_compliance": sum(1 for lat in cat_latencies if lat <= 60.0) / len(cat_latencies)
                    }
                else:
                    category_stats[category] = {
                        "average_latency": 0.0,
                        "query_count": 0,
                        "target_compliance": 0.0
                    }

            # Overall statistics
            overall_stats = {
                "average_latency": sum(latencies) / len(latencies),
                "min_latency": min(latencies),
                "max_latency": max(latencies),
                "successful_queries": len(successful_results),
                "total_queries": len(self.direct_results),
                "success_rate": len(successful_results) / len(self.direct_results),
                "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
            }
        else:
            category_stats = {cat: {"average_latency": 0.0, "query_count": 0, "target_compliance": 0.0}
                              for cat in ["diagnosis", "treatment", "mixed"]}
            overall_stats = {
                "average_latency": 0.0,
                "successful_queries": 0,
                "total_queries": len(self.direct_results),
                "success_rate": 0.0,
                "target_compliance": 0.0
            }

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "model_type": "Med42-70B_direct",
            "timestamp": datetime.now().isoformat()
        }

    def save_direct_llm_statistics(self, filename: str = None) -> str:
        """Save direct LLM statistics"""
        stats = self.calculate_direct_llm_statistics()

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"direct_llm_statistics_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

        print(f"📊 Direct LLM statistics saved to: {filepath}")
        return str(filepath)

    def save_direct_medical_outputs(self, filename: str = None) -> str:
        """Save medical outputs for LLM judge evaluation"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"medical_outputs_direct_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        output_data = {
            "evaluation_metadata": {
                "total_outputs": len(self.medical_outputs),
                "categories": list(set(output['category'] for output in self.medical_outputs)),
                "timestamp": datetime.now().isoformat(),
                "model_type": "Med42-70B_direct"
            },
            "medical_outputs": self.medical_outputs
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"📝 Direct medical outputs saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent direct LLM evaluation interface"""

    print("🚀 OnCall.ai Direct LLM Evaluator - Med42-70B Only")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
    else:
        # Default to evaluation/single_test_query.txt for consistency
        query_file = Path(__file__).parent / "single_test_query.txt"

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python direct_llm_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = DirectLLMEvaluator()

    # Parse queries
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test direct LLM for each query
    print(f"\n🧪 Direct LLM Testing (No RAG Pipeline)")

    for category, queries in queries_by_category.items():
        if not queries:
            continue

        print(f"\n📂 Testing {category.upper()} with direct Med42-70B:")

        for i, query_info in enumerate(queries):
            query_text = query_info['text']

            # Direct LLM evaluation
            result = evaluator.evaluate_direct_llm_query(query_text, category)

            # Pause between queries
            if i < len(queries) - 1:
                print(f" ⏳ Pausing 5s before next query...")
                time.sleep(5)

        # Pause between categories
        if category != list(queries_by_category.keys())[-1]:
            print(f"\n⏳ Pausing 10s before next category...")
            time.sleep(10)

    # Save results
    print(f"\n📊 Generating direct LLM analysis...")

    stats_path = evaluator.save_direct_llm_statistics()
    outputs_path = evaluator.save_direct_medical_outputs()

    # Print summary
    stats = evaluator.calculate_direct_llm_statistics()
    overall_results = stats['overall_results']

    print(f"\n📊 === DIRECT LLM EVALUATION SUMMARY ===")
    print(f"Overall Performance:")
    print(f" Average Latency: {overall_results['average_latency']:.2f}s")
    print(f" Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
    print(f" 60s Target Compliance: {overall_results['target_compliance']:.1%}")

    print(f"\nApplicable Metrics:")
    print(f" ✅ Metric 1 (Latency): Measured")
    print(f" ❌ Metric 2 (Extraction): Not applicable - no extraction pipeline")
    print(f" ❌ Metric 3 (Relevance): Not applicable - no retrieval pipeline")
    print(f" ❌ Metric 4 (Coverage): Not applicable - no retrieval content")
    print(f" 🔄 Metric 5 (Actionability): Requires LLM judge evaluation")
    print(f" 🔄 Metric 6 (Evidence): Requires LLM judge evaluation")

    print(f"\n✅ Direct LLM evaluation complete!")
    print(f"📊 Statistics: {stats_path}")
    print(f"📝 Medical Outputs: {outputs_path}")
    print(f"\n💡 Next step: Run python metric5_6_llm_judge_evaluator.py rag,direct for metrics 5-6")
evaluation/latency_evaluator.py
ADDED
@@ -0,0 +1,892 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Comprehensive Evaluator (Metrics 1-8)
========================================================

Single execution to collect all metrics 1-4 data from app.py pipeline.
Generates foundation data for metrics 5-8 evaluation in downstream processors.

COMPLETE METRICS OVERVIEW:

PIPELINE PERFORMANCE METRICS (Collected by this evaluator):
1. Total Latency (總處理時長) - Complete pipeline processing time from query to response
2. Condition Extraction Success Rate (條件抽取成功率) - Success rate of user_prompt.py condition extraction
3. Retrieval Relevance (檢索相關性) - Average cosine similarity scores from retrieval.py results
4. Retrieval Coverage (檢索覆蓋率) - Medical keyword utilization rate between retrieved content and generated advice

LLM JUDGE METRICS (Processed by metric5_6_llm_judge_evaluator.py):
5. Clinical Actionability (臨床可操作性) - Third-party LLM evaluation of medical advice actionability (1-10 scale)
   * Uses batch evaluation strategy with Llama3-70B as judge
   * Measures: Can healthcare providers immediately act on this advice?
   * Target threshold: ≥7.0/10 for acceptable actionability

6. Clinical Evidence Quality (臨床證據品質) - Third-party LLM evaluation of evidence-based quality (1-10 scale)
   * Uses same batch evaluation call as metric 5 for efficiency
   * Measures: Is the advice evidence-based and follows medical standards?
   * Target threshold: ≥7.5/10 for acceptable evidence quality

RETRIEVAL PRECISION METRICS (Processed by metric7_8_precision_MRR.py):
7. Precision@K (檢索精確率) - Proportion of relevant results in top-K retrieval results
   * Uses adaptive threshold based on query complexity (0.15 for complex, 0.25 for simple queries)
   * Query complexity determined by unique emergency keywords count (≥4 = complex)
   * Measures: relevant_results / total_retrieved_results

8. Mean Reciprocal Rank (平均倒數排名) - Average reciprocal rank of first relevant result
   * Uses same adaptive threshold as Precision@K
   * Measures: 1 / rank_of_first_relevant_result (0 if no relevant results)
   * Higher MRR indicates relevant results appear earlier in ranking

DATA FLOW ARCHITECTURE:
1. latency_evaluator.py → comprehensive_details_*.json (metrics 1-4 + pipeline data)
2. latency_evaluator.py → medical_outputs_*.json (medical advice for judge evaluation)
3. metric5_6_llm_judge_evaluator.py → judge_evaluation_*.json (metrics 5-6)
4. metric7_8_precision_MRR.py → precision_mrr_analysis_*.json (metrics 7-8)

Note: This evaluator focuses on metrics 1-4 collection. Metrics 5-8 require separate downstream evaluation.

Author: YanBo Chen
Date: 2025-08-04
"""

import time
import json
import os
import sys
from typing import Dict, List, Any, Set
from datetime import datetime
from pathlib import Path
import re

# Add project path
current_dir = Path(__file__).parent
project_root = current_dir.parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Import existing system components
try:
    from user_prompt import UserPromptProcessor
    from retrieval import BasicRetrievalSystem
    from llm_clients import llm_Med42_70BClient
    from generation import MedicalAdviceGenerator
except ImportError as e:
    print(f"❌ Import failed: {e}")
    print("Please ensure running from project root directory")
    sys.exit(1)


class ComprehensiveEvaluator:
    """Comprehensive evaluator for metrics 1-4 - single execution approach"""

    def __init__(self):
        """Initialize system components (identical to app.py)"""
        print("🔧 Initializing Comprehensive Evaluator...")

        # Initialize existing system components (same as app.py)
        self.llm_client = llm_Med42_70BClient()
        self.retrieval_system = BasicRetrievalSystem()
        self.user_prompt_processor = UserPromptProcessor(
            llm_client=self.llm_client,
            retrieval_system=self.retrieval_system
        )
        self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)

        # Results accumulation for all metrics
        self.comprehensive_results = []
        self.medical_outputs = []

        print("✅ Comprehensive Evaluator initialization complete")

    def extract_medical_keywords(self, text: str) -> Set[str]:
        """Extract medical keywords for coverage analysis"""
        if not text:
            return set()

        medical_keywords = set()
        text_lower = text.lower()

        # Medical terminology patterns
        patterns = [
            r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b',  # Medical suffixes
            r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b',  # Medical prefixes
            r'\b(?:diagnosis|treatment|therapy|intervention|management)\b',  # Medical actions
            r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b',  # Medical entities
            r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b',  # Medical descriptors
            r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b',  # Compound terms
            r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b',  # Medical procedures
            r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b',  # Dosages and timeframes
        ]

        for pattern in patterns:
            matches = re.findall(pattern, text_lower)
            medical_keywords.update(match.strip() for match in matches)

        # Additional common medical terms
        common_medical_terms = [
            'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
            'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
            'protocol', 'guideline', 'recommendation', 'risk', 'factor'
        ]

        for term in common_medical_terms:
            if term in text_lower:
                medical_keywords.add(term)

        # Filter out very short terms and common words
        filtered_keywords = {
            kw for kw in medical_keywords
            if len(kw) > 2 and kw not in ['the', 'and', 'for', 'with', 'are', 'can', 'may']
        }

        return filtered_keywords

    def calculate_coverage_metrics(self, generated_advice: str, retrieval_results: List[Dict]) -> Dict[str, Any]:
        """Calculate coverage metrics from generated advice and retrieval results"""
        if not generated_advice or not retrieval_results:
            return {
                "coverage_score": 0.0,
                "matched_keywords": [],
                "advice_keywords": [],
                "source_keywords": [],
                "coverage_percentage": 0.0,
                "meets_threshold": False
            }

        # Extract keywords from generated advice
        advice_keywords = self.extract_medical_keywords(generated_advice)

        # Extract keywords from all retrieved documents
        all_source_keywords = set()
        for doc in retrieval_results:
            doc_content = doc.get('content', '') or doc.get('text', '')
            doc_keywords = self.extract_medical_keywords(doc_content)
            all_source_keywords.update(doc_keywords)

        # Calculate coverage
        matched_keywords = advice_keywords.intersection(all_source_keywords)
        coverage_score = len(matched_keywords) / len(all_source_keywords) if all_source_keywords else 0.0

        return {
            "coverage_score": coverage_score,
            "matched_keywords": list(matched_keywords),
            "advice_keywords": list(advice_keywords),
            "source_keywords": list(all_source_keywords),
            "advice_keywords_count": len(advice_keywords),
            "source_keywords_count": len(all_source_keywords),
            "matched_keywords_count": len(matched_keywords),
            "coverage_percentage": coverage_score * 100,
            "meets_threshold": coverage_score >= 0.4
        }

    def evaluate_single_query_comprehensive(self, query: str, category: str = "unknown") -> Dict[str, Any]:
        """
        Comprehensive evaluation for single query - collects all metrics 1-4 data

        Replicates app.py's process_medical_query pipeline exactly

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)
        """
        print(f"🔍 Comprehensive evaluation: {query[:50]}...")
        print(f"📋 Category: {category}")

        overall_start = time.time()
        timing_details = {}

        try:
            # STEP 1: Query Processing and Condition Extraction (identical to app.py)
            step1_start = time.time()
            condition_result = self.user_prompt_processor.extract_condition_keywords(query)
            step1_time = time.time() - step1_start
            timing_details['step1_condition_extraction'] = step1_time

            print(f" Step 1 - Condition extraction: {step1_time:.3f}s")
            print(f" Extracted condition: {condition_result.get('condition', 'None')}")

            # Check if valid medical query
            if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
                total_time = time.time() - overall_start
                return self._create_failed_result(query, category, total_time, timing_details,
                                                  "non_medical", condition_result)

            # STEP 2: User Confirmation (simulate auto-confirmation)
            step2_start = time.time()
            confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
            step2_time = time.time() - step2_start
            timing_details['step2_confirmation'] = step2_time

            if not condition_result.get('condition'):
                total_time = time.time() - overall_start
                return self._create_failed_result(query, category, total_time, timing_details,
                                                  "no_condition", condition_result)

            # STEP 3: Medical Guidelines Retrieval (identical to app.py)
            step3_start = time.time()

            search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
            if not search_query:
                search_query = condition_result.get('condition', query)

            retrieval_results = self.retrieval_system.search(search_query, top_k=5)
            step3_time = time.time() - step3_start
            timing_details['step3_retrieval'] = step3_time

            processed_results = retrieval_results.get('processed_results', [])
            print(f" Step 3 - Retrieval: {step3_time:.3f}s ({len(processed_results)} results)")

            # STEP 4: Medical Advice Generation (identical to app.py)
            step4_start = time.time()

            intention = self._detect_query_intention(query)
            medical_advice_result = self.medical_generator.generate_medical_advice(
                user_query=query,
                retrieval_results=retrieval_results,
                intention=intention
            )
            step4_time = time.time() - step4_start
            timing_details['step4_generation'] = step4_time

            generated_advice = medical_advice_result.get('medical_advice', '')
            confidence_score = medical_advice_result.get('confidence_score', 0.0)

            print(f" Step 4 - Generation: {step4_time:.3f}s")

            total_time = time.time() - overall_start

            # METRIC 2: Condition Extraction Analysis
            extraction_success = (
                condition_result.get('condition') and
                condition_result.get('condition') != "unknown" and
                condition_result.get('query_status') not in ['invalid_query', 'non_medical']
            )

            extraction_metrics = {
                "extraction_success": extraction_success,
                "extracted_condition": condition_result.get('condition'),
                "query_status": condition_result.get('query_status'),
                "emergency_keywords": condition_result.get('emergency_keywords', []),
                "treatment_keywords": condition_result.get('treatment_keywords', []),
                "fallback_level": condition_result.get('fallback_level', 'unknown'),
                "extraction_time": step1_time
            }

            # METRIC 3: Retrieval Relevance Analysis
            if processed_results:
                relevance_scores = []
                for doc_result in processed_results:
                    # Get angular distance and convert to relevance using correct formula
                    distance = doc_result.get('distance', 1.0)
                    relevance = 1.0 - (distance**2) / 2.0  # Correct mathematical conversion
                    relevance_scores.append(relevance)

                average_relevance = sum(relevance_scores) / len(relevance_scores)
                high_relevance_count = sum(1 for score in relevance_scores if score >= 0.85)

                relevance_metrics = {
                    "average_relevance": average_relevance,
                    "max_relevance": max(relevance_scores),
                    "min_relevance": min(relevance_scores),
                    "relevance_scores": relevance_scores,
                    "high_relevance_count": high_relevance_count,
                    "high_relevance_ratio": high_relevance_count / len(relevance_scores),
                    "retrieved_count": len(processed_results),
                    "meets_threshold": average_relevance >= 0.85,
                    "retrieval_time": step3_time
                }
            else:
                relevance_metrics = {
                    "average_relevance": 0.0,
                    "max_relevance": 0.0,
                    "min_relevance": 0.0,
                    "similarity_scores": [],
                    "high_relevance_count": 0,
                    "high_relevance_ratio": 0.0,
                    "retrieved_count": 0,
                    "meets_threshold": False,
                    "retrieval_time": step3_time
                }

            # METRIC 4: Retrieval Coverage Analysis
            coverage_metrics = self.calculate_coverage_metrics(generated_advice, processed_results)
            coverage_metrics["generation_time"] = step4_time

            # Create comprehensive result
            comprehensive_result = {
                "query": query,
                "category": category,

                # Metric 1: Total Latency - Complete pipeline processing time
                "latency_metrics": {
                    "total_latency": total_time,
                    "timing_details": timing_details,
                    "meets_target": total_time <= 60.0
                },

                # Metric 2: Condition Extraction - Success rate from user_prompt.py
                "extraction_metrics": extraction_metrics,

                # Metric 3: Retrieval Relevance - Cosine similarity from retrieval.py
                "relevance_metrics": relevance_metrics,

                # Metric 4: Retrieval Coverage - Advice utilization of retrieved content
                "coverage_metrics": coverage_metrics,

                # Complete pipeline data (for debugging and detailed analysis)
                "pipeline_data": {
                    "condition_result": condition_result,
                    "retrieval_results": retrieval_results,
                    "medical_advice_result": medical_advice_result,
                    "search_query": search_query,
                    "intention": intention
                },

                "overall_success": True,
                "timestamp": datetime.now().isoformat()
            }

            # Validate data completeness for metrics 7-8 analysis
            ready = True
            data = comprehensive_result.get('pipeline_data', {})

            # 1. Check retrieval results completeness for precision/MRR calculation
            retr = data.get('retrieval_results', {}).get('processed_results', [])
            if not retr or 'distance' not in retr[0]:
                ready = False

            # 2. Check condition extraction completeness for complexity analysis
            cond = data.get('condition_result', {}).get('condition')
            if not cond:
                ready = False

            # 3. Check overall execution status
            if not comprehensive_result.get('overall_success', False):
                ready = False

            # 4. Check retrieval timing data completeness
            if 'retrieval_time' not in comprehensive_result.get('relevance_metrics', {}):
                ready = False

            # Set metrics 7-8 readiness flag for downstream precision/MRR analysis
            comprehensive_result['precision_mrr_ready'] = ready

            # Store result
            self.comprehensive_results.append(comprehensive_result)

            # Store medical output for model comparison
            medical_output = {
                "query": query,
                "category": category,
                "medical_advice": generated_advice,
                "confidence_score": confidence_score,
                "query_id": f"{category}_query",
                "processing_time": total_time,
                "timestamp": datetime.now().isoformat()
            }
            self.medical_outputs.append(medical_output)

            print(f"✅ Comprehensive evaluation completed in {total_time:.2f}s")
            print(f" 📊 Metrics: Latency={total_time:.2f}s, Extraction={'✅' if extraction_success else '❌'}, "
                  f"Relevance={average_relevance:.3f}, Coverage={coverage_metrics['coverage_score']:.3f}")

            return comprehensive_result

        except Exception as e:
            total_time = time.time() - overall_start
            print(f"❌ Comprehensive evaluation failed after {total_time:.2f}s: {e}")

            return self._create_failed_result(query, category, total_time, timing_details, "error", None, str(e))

    def _create_failed_result(self, query: str, category: str, total_time: float,
                              timing_details: Dict, status: str, condition_result: Dict = None,
                              error: str = None) -> Dict[str, Any]:
        """Create standardized failed result"""
        failed_result = {
            "query": query,
            "category": category,

            # Metric 1: Total Latency - Always measurable even on failure
            "latency_metrics": {
                "total_latency": total_time,
                "timing_details": timing_details,
                "meets_target": total_time <= 60.0
            },

            # Metric 2: Condition Extraction - Partial data may be available before failure
            "extraction_metrics": {
                "extraction_success": False,
                "extracted_condition": condition_result.get('condition') if condition_result else None,
                "query_status": condition_result.get('query_status') if condition_result else status,
                "extraction_time": timing_details.get('step1_condition_extraction', 0.0)
            },

            # Metric 3: Retrieval Relevance - Failed due to pipeline failure
            "relevance_metrics": {
                "average_relevance": 0.0,
                "retrieved_count": 0,
                "meets_threshold": False,
                "retrieval_time": timing_details.get('step3_retrieval', 0.0)
            },

            # Metric 4: Retrieval Coverage - Failed due to pipeline failure
            "coverage_metrics": {
                "coverage_score": 0.0,
                "meets_threshold": False,
                "generation_time": timing_details.get('step4_generation', 0.0)
            },

            # Note: Metrics 5-6 (Clinical Actionability & Evidence Quality)
            # are collected by metric5_6_llm_judge_evaluator.py using medical_outputs
            # Metrics 7-8 (Precision@K & MRR) are collected by metric7_8_precision_MRR.py
            # using comprehensive_details pipeline data

            "overall_success": False,
            "status": status,
            "error": error,
            "timestamp": datetime.now().isoformat()
        }

        # For failed results, precision/MRR analysis data is not ready
        failed_result['precision_mrr_ready'] = False

        self.comprehensive_results.append(failed_result)
        return failed_result

    def _detect_query_intention(self, query: str) -> str:
        """Simplified query intention detection (from app.py)"""
        query_lower = query.lower()

        if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
            return 'diagnosis'
        elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
            return 'treatment'
        else:
            return 'mixed'

    def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
        """Parse queries from file with category labels"""
        print(f"📁 Reading queries from file: {filepath}")

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            # Parse queries with category labels
            queries_by_category = {
                "diagnosis": [],
                "treatment": [],
                "mixed": []
            }

            lines = content.strip().split('\n')

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Parse format: "1.diagnosis: query text"
                match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
                if match:
                    category_raw = match.group(1).lower()
                    query_text = match.group(2).strip()

                    # Normalize category name
                    if category_raw in ['mixed/complicated', 'mixed']:
                        category = 'mixed'
                    else:
                        category = category_raw

                    if category in queries_by_category and len(query_text) > 15:
                        queries_by_category[category].append({
                            "text": query_text,
                            "category": category
                        })

            print(f"📋 Parsed queries by category:")
            for category, category_queries in queries_by_category.items():
                print(f" {category.capitalize()}: {len(category_queries)} queries")

            return queries_by_category

        except Exception as e:
            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

    def calculate_metric_statistics(self, metric_name: str) -> Dict[str, Any]:
        """Calculate statistics for a specific metric across all results"""
        category_stats = {}
        all_successful_results = []

        # Group results by category
        results_by_category = {
            "diagnosis": [],
            "treatment": [],
            "mixed": []
        }

        for result in self.comprehensive_results:
            category = result.get('category', 'unknown')
            if category in results_by_category:
                results_by_category[category].append(result)
                if result.get('overall_success'):
                    all_successful_results.append(result)

        # Calculate statistics for each category based on metric type
        for category, results in results_by_category.items():
            successful_results = [r for r in results if r.get('overall_success')]

            if metric_name == "latency":
                if successful_results:
                    latencies = [r['latency_metrics']['total_latency'] for r in successful_results]
                    category_stats[category] = {
                        "average_latency": sum(latencies) / len(latencies),
                        "std_deviation": self._calculate_std(latencies),
                        "min_latency": min(latencies),
                        "max_latency": max(latencies),
                        "query_count": len(latencies),
                        "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies),
                        "individual_latencies": latencies
                    }
                else:
                    category_stats[category] = self._get_empty_latency_stats()

            elif metric_name == "extraction":
                extraction_successes = [r['extraction_metrics']['extraction_success'] for r in results]
                successful_extractions = sum(extraction_successes)

                category_stats[category] = {
                    "success_rate": successful_extractions / len(results) if results else 0.0,
                    "successful_count": successful_extractions,
                    "total_count": len(results),
                    "average_extraction_time": sum(r['extraction_metrics']['extraction_time'] for r in results) / len(results) if results else 0.0,
                    "meets_threshold": (successful_extractions / len(results)) >= 0.8 if results else False
                }

            elif metric_name == "relevance":
                if successful_results:
                    relevance_scores = [r['relevance_metrics']['average_relevance'] for r in successful_results]
                    category_stats[category] = {
                        "average_relevance": sum(relevance_scores) / len(relevance_scores),
                        "max_relevance": max(relevance_scores),
                        "min_relevance": min(relevance_scores),
                        "successful_retrievals": len(successful_results),
                        "total_queries": len(results),
                        "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.85,
                        "individual_relevance_scores": relevance_scores
                    }
                else:
                    category_stats[category] = self._get_empty_relevance_stats(len(results))

            elif metric_name == "coverage":
                if successful_results:
                    coverage_scores = [r['coverage_metrics']['coverage_score'] for r in successful_results]
                    category_stats[category] = {
                        "average_coverage": sum(coverage_scores) / len(coverage_scores),
                        "max_coverage": max(coverage_scores),
                        "min_coverage": min(coverage_scores),
                        "successful_evaluations": len(successful_results),
                        "total_queries": len(results),
                        "meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.4,
                        "individual_coverage_scores": coverage_scores
                    }
                else:
                    category_stats[category] = self._get_empty_coverage_stats(len(results))

        # Calculate overall statistics
        overall_stats = self._calculate_overall_stats(metric_name, all_successful_results)

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "timestamp": datetime.now().isoformat()
        }

    def _calculate_std(self, values: List[float]) -> float:
        """Calculate standard deviation"""
        if len(values) < 2:
            return 0.0

        mean = sum(values) / len(values)
        variance = sum((x - mean) ** 2 for x in values) / len(values)
        return variance ** 0.5

    def _get_empty_latency_stats(self) -> Dict[str, Any]:
        """Return empty latency statistics"""
        return {
            "average_latency": 0.0,
            "std_deviation": 0.0,
            "min_latency": 0.0,
            "max_latency": 0.0,
            "query_count": 0,
            "target_compliance": 0.0,
            "individual_latencies": []
        }

    def _get_empty_relevance_stats(self, total_queries: int) -> Dict[str, Any]:
        """Return empty relevance statistics"""
        return {
            "average_relevance": 0.0,
            "max_relevance": 0.0,
            "min_relevance": 0.0,
            "successful_retrievals": 0,
            "total_queries": total_queries,
            "meets_threshold": False,
            "individual_relevance_scores": []
        }

    def _get_empty_coverage_stats(self, total_queries: int) -> Dict[str, Any]:
        """Return empty coverage statistics"""
        return {
            "average_coverage": 0.0,
            "max_coverage": 0.0,
            "min_coverage": 0.0,
            "successful_evaluations": 0,
            "total_queries": total_queries,
            "meets_threshold": False,
            "individual_coverage_scores": []
        }

    def _calculate_overall_stats(self, metric_name: str, all_successful_results: List[Dict]) -> Dict[str, Any]:
        """Calculate overall statistics for a specific metric"""
        total_queries = len(self.comprehensive_results)

        if metric_name == "latency" and all_successful_results:
            latencies = [r['latency_metrics']['total_latency'] for r in all_successful_results]
            return {
                "average_latency": sum(latencies) / len(latencies),
                "std_deviation": self._calculate_std(latencies),
                "min_latency": min(latencies),
                "max_latency": max(latencies),
                "successful_queries": len(all_successful_results),
                "total_queries": total_queries,
                "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
            }

        elif metric_name == "extraction":
            all_extractions = [r['extraction_metrics']['extraction_success'] for r in self.comprehensive_results]
            successful_extractions = sum(all_extractions)
            return {
                "success_rate": successful_extractions / len(all_extractions) if all_extractions else 0.0,
                "successful_count": successful_extractions,
                "total_count": len(all_extractions),
                "target_compliance": (successful_extractions / len(all_extractions)) >= 0.8 if all_extractions else False
            }

        elif metric_name == "relevance" and all_successful_results:
            relevance_scores = [r['relevance_metrics']['average_relevance'] for r in all_successful_results]
            return {
                "average_relevance": sum(relevance_scores) / len(relevance_scores),
                "max_relevance": max(relevance_scores),
                "min_relevance": min(relevance_scores),
                "successful_queries": len(all_successful_results),
                "total_queries": total_queries,
                "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.85,
                "target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.7
            }

        elif metric_name == "coverage" and all_successful_results:
            coverage_scores = [r['coverage_metrics']['coverage_score'] for r in all_successful_results]
            return {
                "average_coverage": sum(coverage_scores) / len(coverage_scores),
                "max_coverage": max(coverage_scores),
                "min_coverage": min(coverage_scores),
                "successful_queries": len(all_successful_results),
                "total_queries": total_queries,
                "meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.4
            }

        # Return empty stats for failed cases
        return {
            "average_value": 0.0,
            "successful_queries": len(all_successful_results),
            "total_queries": total_queries,
            "meets_threshold": False
        }

    def save_all_metric_statistics(self) -> Dict[str, str]:
        """Save separate statistics files for each metric"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)

        saved_files = {}

        # Save statistics for each metric
        for metric_name in ["latency", "extraction", "relevance", "coverage"]:
            stats = self.calculate_metric_statistics(metric_name)
            filename = f"{metric_name}_statistics_{timestamp}.json"
            filepath = results_dir / filename

            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(stats, f, indent=2, ensure_ascii=False)

            saved_files[metric_name] = str(filepath)
            print(f"📊 {metric_name.capitalize()} statistics saved to: {filepath}")

        return saved_files

    def save_medical_outputs(self, filename: str = None) -> str:
        """Save medical advice outputs for model comparison"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"medical_outputs_{timestamp}.json"

        # Ensure results directory exists
|
| 736 |
+
results_dir = Path(__file__).parent / "results"
|
| 737 |
+
results_dir.mkdir(exist_ok=True)
|
| 738 |
+
|
| 739 |
+
filepath = results_dir / filename
|
| 740 |
+
|
| 741 |
+
# Create comprehensive output data
|
| 742 |
+
output_data = {
|
| 743 |
+
"evaluation_metadata": {
|
| 744 |
+
"total_outputs": len(self.medical_outputs),
|
| 745 |
+
"categories": list(set(output['category'] for output in self.medical_outputs)),
|
| 746 |
+
"timestamp": datetime.now().isoformat(),
|
| 747 |
+
"model_type": "Med42-70B_RAG_enhanced" # For future comparison
|
| 748 |
+
},
|
| 749 |
+
"medical_outputs": self.medical_outputs
|
| 750 |
+
}
|
| 751 |
+
|
| 752 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 753 |
+
json.dump(output_data, f, indent=2, ensure_ascii=False)
|
| 754 |
+
|
| 755 |
+
print(f"📝 Medical outputs saved to: {filepath}")
|
| 756 |
+
return str(filepath)
|
| 757 |
+
|
| 758 |
+
def save_comprehensive_details(self, filename: str = None) -> str:
|
| 759 |
+
"""Save comprehensive detailed results"""
|
| 760 |
+
if filename is None:
|
| 761 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 762 |
+
filename = f"comprehensive_details_{timestamp}.json"
|
| 763 |
+
|
| 764 |
+
# Ensure results directory exists
|
| 765 |
+
results_dir = Path(__file__).parent / "results"
|
| 766 |
+
results_dir.mkdir(exist_ok=True)
|
| 767 |
+
|
| 768 |
+
filepath = results_dir / filename
|
| 769 |
+
|
| 770 |
+
# Create comprehensive evaluation data
|
| 771 |
+
comprehensive_data = {
|
| 772 |
+
"evaluation_metadata": {
|
| 773 |
+
"total_queries": len(self.comprehensive_results),
|
| 774 |
+
"successful_queries": len([r for r in self.comprehensive_results if r.get('overall_success')]),
|
| 775 |
+
"timestamp": datetime.now().isoformat(),
|
| 776 |
+
"evaluator_type": "comprehensive_metrics_1_to_4",
|
| 777 |
+
"metrics_evaluated": ["latency", "extraction", "relevance", "coverage"]
|
| 778 |
+
},
|
| 779 |
+
"comprehensive_results": self.comprehensive_results
|
| 780 |
+
}
|
| 781 |
+
|
| 782 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 783 |
+
json.dump(comprehensive_data, f, indent=2, ensure_ascii=False)
|
| 784 |
+
|
| 785 |
+
print(f"📋 Comprehensive details saved to: {filepath}")
|
| 786 |
+
return str(filepath)
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
# Independent execution interface
|
| 790 |
+
if __name__ == "__main__":
|
| 791 |
+
"""Independent comprehensive evaluation interface"""
|
| 792 |
+
|
| 793 |
+
print("🚀 OnCall.ai Comprehensive Evaluator - Metrics 1-4 in Single Run")
|
| 794 |
+
|
| 795 |
+
if len(sys.argv) > 1:
|
| 796 |
+
query_file = sys.argv[1]
|
| 797 |
+
else:
|
| 798 |
+
# Default to evaluation/single_test_query.txt for initial testing
|
| 799 |
+
query_file = Path(__file__).parent / "single_test_query.txt"
|
| 800 |
+
|
| 801 |
+
if not os.path.exists(query_file):
|
| 802 |
+
print(f"❌ Query file not found: {query_file}")
|
| 803 |
+
print("Usage: python latency_evaluator.py [query_file.txt]")
|
| 804 |
+
sys.exit(1)
|
| 805 |
+
|
| 806 |
+
# Initialize evaluator
|
| 807 |
+
evaluator = ComprehensiveEvaluator()
|
| 808 |
+
|
| 809 |
+
# Parse queries from file
|
| 810 |
+
queries_by_category = evaluator.parse_queries_from_file(str(query_file))
|
| 811 |
+
|
| 812 |
+
if "error" in queries_by_category:
|
| 813 |
+
print(f"❌ Failed to parse queries: {queries_by_category['error']}")
|
| 814 |
+
sys.exit(1)
|
| 815 |
+
|
| 816 |
+
# Test each query comprehensively
|
| 817 |
+
print(f"\n🧪 Comprehensive Evaluation - All Metrics in Single Run")
|
| 818 |
+
print(f"📊 Collecting metrics 1-4 from single app.py pipeline execution")
|
| 819 |
+
|
| 820 |
+
for category, queries in queries_by_category.items():
|
| 821 |
+
if not queries:
|
| 822 |
+
continue
|
| 823 |
+
|
| 824 |
+
print(f"\n📂 Testing {category.upper()} queries:")
|
| 825 |
+
|
| 826 |
+
for i, query_info in enumerate(queries):
|
| 827 |
+
query_text = query_info['text']
|
| 828 |
+
print(f"\n🔍 Query {i+1}/{len(queries)} in {category} category:")
|
| 829 |
+
print(f" Text: {query_text}")
|
| 830 |
+
|
| 831 |
+
# Comprehensive evaluation (collects all metrics 1-4)
|
| 832 |
+
result = evaluator.evaluate_single_query_comprehensive(query_text, category)
|
| 833 |
+
|
| 834 |
+
# Pause between queries to avoid rate limits
|
| 835 |
+
if i < len(queries) - 1:
|
| 836 |
+
print(f" ⏳ Pausing 5s before next query...")
|
| 837 |
+
time.sleep(5)
|
| 838 |
+
|
| 839 |
+
# Longer pause between categories
|
| 840 |
+
if category != list(queries_by_category.keys())[-1]:
|
| 841 |
+
print(f"\n⏳ Pausing 10s before next category...")
|
| 842 |
+
time.sleep(10)
|
| 843 |
+
|
| 844 |
+
# Generate and save all metric statistics
|
| 845 |
+
print(f"\n📊 Generating comprehensive analysis for all metrics...")
|
| 846 |
+
|
| 847 |
+
# Save separate statistics for each metric
|
| 848 |
+
saved_stats = evaluator.save_all_metric_statistics()
|
| 849 |
+
|
| 850 |
+
# Save medical outputs for model comparison
|
| 851 |
+
outputs_path = evaluator.save_medical_outputs()
|
| 852 |
+
|
| 853 |
+
# Save comprehensive details
|
| 854 |
+
details_path = evaluator.save_comprehensive_details()
|
| 855 |
+
|
| 856 |
+
# Print comprehensive summary
|
| 857 |
+
print(f"\n📊 === COMPREHENSIVE EVALUATION SUMMARY ===")
|
| 858 |
+
|
| 859 |
+
for metric_name in ["latency", "extraction", "relevance", "coverage"]:
|
| 860 |
+
stats = evaluator.calculate_metric_statistics(metric_name)
|
| 861 |
+
overall_results = stats['overall_results']
|
| 862 |
+
|
| 863 |
+
print(f"\n{metric_name.upper()} METRICS:")
|
| 864 |
+
|
| 865 |
+
if metric_name == "latency":
|
| 866 |
+
print(f" Average: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
|
| 867 |
+
print(f" 60s Target: {'✅ Met' if overall_results['target_compliance'] >= 0.8 else '❌ Not Met'}")
|
| 868 |
+
|
| 869 |
+
elif metric_name == "extraction":
|
| 870 |
+
print(f" Success Rate: {overall_results['success_rate']:.1%}")
|
| 871 |
+
print(f" 80% Target: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")
|
| 872 |
+
|
| 873 |
+
elif metric_name == "relevance":
|
| 874 |
+
print(f" Average Relevance: {overall_results['average_relevance']:.3f}")
|
| 875 |
+
print(f" 0.70 Target: {'✅ Met' if overall_results.get('target_compliance', False) else '❌ Not Met'}")
|
| 876 |
+
|
| 877 |
+
elif metric_name == "coverage":
|
| 878 |
+
print(f" Average Coverage: {overall_results['average_coverage']:.3f} ({overall_results['average_coverage']*100:.1f}%)")
|
| 879 |
+
print(f" 40% Target: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
|
| 880 |
+
|
| 881 |
+
print(f"\n✅ Comprehensive evaluation complete! Files saved:")
|
| 882 |
+
for metric_name, filepath in saved_stats.items():
|
| 883 |
+
print(f" 📊 {metric_name.capitalize()}: {filepath}")
|
| 884 |
+
print(f" 📝 Medical Outputs: {outputs_path}")
|
| 885 |
+
print(f" 📋 Comprehensive Details: {details_path}")
|
| 886 |
+
print(f"\n💡 Next step: Run downstream evaluators for metrics 5-8")
|
| 887 |
+
print(f" python metric5_6_llm_judge_evaluator.py rag")
|
| 888 |
+
print(f" python metric7_8_precision_MRR.py {details_path}")
|
| 889 |
+
print(f" python latency_chart_generator.py")
|
| 890 |
+
print(f" python extraction_chart_generator.py # (create separately)")
|
| 891 |
+
print(f" python relevance_chart_generator.py # (create separately)")
|
| 892 |
+
print(f" python coverage_chart_generator.py # (create separately)")
|
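For orientation when reading the chart generators that follow: every `*_statistics_*.json` file written by `save_all_metric_statistics()` shares the shape returned by `calculate_metric_statistics()`, and the generators only read those keys. A minimal illustrative sketch for the latency file (all numeric values are invented, not real results):

# Illustrative shape of latency_statistics_<timestamp>.json (values invented for illustration)
example_latency_statistics = {
    "category_results": {
        "diagnosis": {
            "average_latency": 24.8,
            "std_deviation": 3.1,
            "min_latency": 21.0,
            "max_latency": 29.5,
            "query_count": 3,
            "target_compliance": 1.0,
            "individual_latencies": [21.0, 24.0, 29.5]
        }
    },
    "overall_results": {
        "average_latency": 24.8,
        "std_deviation": 3.1,
        "min_latency": 21.0,
        "max_latency": 29.5,
        "successful_queries": 3,
        "total_queries": 3,
        "target_compliance": 1.0
    },
    "timestamp": "2025-08-04T12:00:00"
}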
evaluation/metric1_latency_chart_generator.py
ADDED
@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Latency Chart Generator
==========================================

Generates comprehensive latency analysis charts from saved statistics.
Reads JSON files produced by latency_evaluator.py and creates visualizations.

No LLM calls - pure data visualization.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class LatencyChartGenerator:
    """Generate charts from latency evaluation statistics - no LLM dependency"""

    def __init__(self):
        """Initialize chart generator"""
        print("📈 Initializing Latency Chart Generator...")

        # Set up professional chart style
        plt.style.use('default')
        sns.set_palette("husl")

        print("✅ Chart Generator ready")

    def load_latest_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """
        Load the most recent latency statistics file

        Args:
            results_dir: Directory containing statistics files
        """
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        # Find latest statistics file
        pattern = str(results_dir / "latency_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No latency statistics files found in {results_dir}")

        # Get the most recent file
        latest_file = max(stat_files, key=os.path.getmtime)

        print(f"📊 Loading statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_comprehensive_charts(self, stats: Dict[str, Any]) -> str:
        """
        Generate comprehensive 4-category latency analysis charts

        Creates professional charts showing:
        1. Category comparison bar chart
        2. Individual query scatter plot
        3. Statistical summary table
        4. Performance distribution box plot
        """
        try:
            # Create figure with subplots
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Latency Analysis - Category Comparison',
                         fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Category Comparison Bar Chart
            ax1 = axes[0, 0]
            categories = []
            avg_latencies = []
            std_devs = []

            # Collect category data
            for category, cat_stats in category_results.items():
                if cat_stats['query_count'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_latencies.append(cat_stats['average_latency'])
                    std_devs.append(cat_stats['std_deviation'])

            # Add overall
            categories.append('Overall')
            avg_latencies.append(overall_results['average_latency'])
            std_devs.append(overall_results['std_deviation'])

            # Create bar chart with error bars
            bars = ax1.bar(categories, avg_latencies, capsize=5, alpha=0.8,
                           color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.errorbar(categories, avg_latencies, yerr=std_devs, fmt='none',
                         color='black', capsize=3, capthick=1)

            ax1.set_title('Average Latency by Category', fontweight='bold')
            ax1.set_ylabel('Latency (seconds)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add value labels on bars
            for bar, avg, std in zip(bars, avg_latencies, std_devs):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + std*0.1,
                         f'{avg:.1f}s', ha='center', va='bottom', fontweight='bold')

            # Add target line
            ax1.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
            ax1.legend()

            # Chart 2: Individual Query Performance
            ax2 = axes[0, 1]

            query_indices = []
            latencies = []
            colors = []

            color_map = {'diagnosis': '#1f77b4', 'treatment': '#ff7f0e', 'mixed': '#d62728'}
            query_idx = 0

            for category, cat_stats in category_results.items():
                for latency in cat_stats['individual_latencies']:
                    query_indices.append(query_idx)
                    latencies.append(latency)
                    colors.append(color_map.get(category, 'gray'))
                    query_idx += 1

            if latencies:
                ax2.scatter(query_indices, latencies, c=colors, alpha=0.7, s=100)
                ax2.set_title('Individual Query Performance', fontweight='bold')
                ax2.set_ylabel('Latency (seconds)')
                ax2.set_xlabel('Query Index')
                ax2.grid(True, alpha=0.3)

                # Add target line
                ax2.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')

                # Add category legend
                from matplotlib.patches import Patch
                legend_elements = [Patch(facecolor=color_map[cat], label=cat.title())
                                   for cat in color_map.keys() if cat in category_results.keys()]
                ax2.legend(handles=legend_elements)
            else:
                ax2.text(0.5, 0.5, 'No latency data available',
                         ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Individual Query Performance', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            # Create summary table
            table_data = []
            headers = ['Category', 'Avg (s)', 'Std (s)', 'Min (s)', 'Max (s)', 'Count']

            for category, cat_stats in category_results.items():
                if cat_stats['query_count'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['average_latency']:.2f}",
                        f"{cat_stats['std_deviation']:.2f}",
                        f"{cat_stats['min_latency']:.2f}",
                        f"{cat_stats['max_latency']:.2f}",
                        str(cat_stats['query_count'])
                    ])

            # Add overall row
            table_data.append([
                'Overall',
                f"{overall_results['average_latency']:.2f}",
                f"{overall_results['std_deviation']:.2f}",
                f"{overall_results['min_latency']:.2f}",
                f"{overall_results['max_latency']:.2f}",
                str(overall_results['successful_queries'])
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center',
                                  colWidths=[0.2, 0.15, 0.15, 0.15, 0.15, 0.1])
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style the table header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Statistical Summary', fontweight='bold', pad=20)

            # Chart 4: Performance Distribution
            ax4 = axes[1, 1]

            # Create box plot if we have multiple data points
            box_data = []
            box_labels = []

            for category, cat_stats in category_results.items():
                if cat_stats['individual_latencies'] and len(cat_stats['individual_latencies']) > 0:
                    box_data.append(cat_stats['individual_latencies'])
                    box_labels.append(category.replace('_', ' ').title())

            if box_data and len(box_data) > 0:
                box_plot = ax4.boxplot(box_data, labels=box_labels, patch_artist=True)

                # Color the boxes
                colors = ['#1f77b4', '#ff7f0e', '#d62728']
                for patch, color in zip(box_plot['boxes'], colors[:len(box_plot['boxes'])]):
                    patch.set_facecolor(color)
                    patch.set_alpha(0.7)

                ax4.set_title('Latency Distribution by Category', fontweight='bold')
                ax4.set_ylabel('Latency (seconds)')
                ax4.grid(True, alpha=0.3)

                # Add target line
                ax4.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
                ax4.legend()
            else:
                # For single data points, show a simple bar chart
                single_categories = []
                single_latencies = []

                for category, cat_stats in category_results.items():
                    if cat_stats['query_count'] > 0:
                        single_categories.append(category.replace('_', ' ').title())
                        single_latencies.append(cat_stats['average_latency'])

                if single_categories:
                    ax4.bar(single_categories, single_latencies, alpha=0.7,
                            color=['#1f77b4', '#ff7f0e', '#d62728'][:len(single_categories)])
                    ax4.set_title('Category Latency (Single Query Each)', fontweight='bold')
                    ax4.set_ylabel('Latency (seconds)')
                    ax4.grid(True, alpha=0.3)
                    ax4.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
                    ax4.legend()
                else:
                    ax4.text(0.5, 0.5, 'No data available for distribution plot',
                             ha='center', va='center', transform=ax4.transAxes)
                    ax4.set_title('Latency Distribution', fontweight='bold')

            # Adjust layout and save
            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"latency_analysis_charts_{timestamp}.png"

            # Ensure results directory exists
            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight',
                        facecolor='white', edgecolor='none')
            plt.close()

            print(f"📈 Charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Chart generation failed: {e}")
            return ""

    def print_statistics_summary(self, stats: Dict[str, Any]):
        """Print formatted statistics summary to console"""
        category_results = stats['category_results']
        overall_results = stats['overall_results']

        print(f"\n📊 === LATENCY ANALYSIS CHART SUMMARY ===")
        print(f"Overall Performance:")
        print(f"   Average Latency: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
        print(f"   Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
        print(f"   30s Target Compliance: {overall_results['target_compliance']:.1%}")

        print(f"\nCategory Breakdown:")
        for category, cat_stats in category_results.items():
            if cat_stats['query_count'] > 0:
                print(f"   {category.capitalize()}: {cat_stats['average_latency']:.2f}s (±{cat_stats['std_deviation']:.2f}) [{cat_stats['query_count']} queries]")


# Independent execution interface
if __name__ == "__main__":
    """Independent chart generation interface"""

    print("📈 OnCall.ai Latency Chart Generator")

    # Initialize chart generator
    chart_gen = LatencyChartGenerator()

    try:
        # Load latest statistics
        stats = chart_gen.load_latest_statistics()

        # Generate charts
        chart_path = chart_gen.generate_comprehensive_charts(stats)

        # Print summary
        chart_gen.print_statistics_summary(stats)

        print(f"\n✅ Chart generation complete!")
        print(f"📈 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
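If the charts need to be produced from a notebook or another script rather than the CLI entry point, the class can be driven directly. A minimal sketch, assuming the statistics live under an `evaluation/results` directory (that path is an assumption); note that `load_latest_statistics` joins the directory with the `/` operator, so passing a `Path` rather than a plain string is the safe choice even though the parameter is annotated `str`:

from pathlib import Path

from metric1_latency_chart_generator import LatencyChartGenerator

gen = LatencyChartGenerator()
# Pass a Path: the loader builds its glob pattern with results_dir / "latency_statistics_*.json".
stats = gen.load_latest_statistics(results_dir=Path("evaluation/results"))
chart_path = gen.generate_comprehensive_charts(stats)
gen.print_statistics_summary(stats)
print(chart_path)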
evaluation/metric2_extraction_chart_generator.py
ADDED
@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Extraction Chart Generator
============================================

Generates extraction success rate charts from saved statistics.
Reads JSON files produced by comprehensive evaluator.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class ExtractionChartGenerator:
    """Generate charts for condition extraction metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("📈 Initializing Extraction Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_extraction_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent extraction statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "extraction_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No extraction statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📊 Loading extraction statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_extraction_charts(self, stats: Dict[str, Any]) -> str:
        """Generate extraction success rate analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Extraction Success Rate Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Success Rate by Category
            ax1 = axes[0, 0]
            categories = []
            success_rates = []

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    success_rates.append(cat_stats['success_rate'] * 100)

            categories.append('Overall')
            success_rates.append(overall_results['success_rate'] * 100)

            bars = ax1.bar(categories, success_rates, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Extraction Success Rate by Category', fontweight='bold')
            ax1.set_ylabel('Success Rate (%)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add target line
            ax1.axhline(y=80, color='red', linestyle='--', alpha=0.7, label='80% Target')
            ax1.legend()

            # Add value labels
            for bar, rate in zip(bars, success_rates):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                         f'{rate:.1f}%', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Success Count
            ax2 = axes[0, 1]
            successful_counts = []
            total_counts = []

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    successful_counts.append(cat_stats['successful_count'])
                    total_counts.append(cat_stats['total_count'])

            successful_counts.append(overall_results['successful_count'])
            total_counts.append(overall_results['total_count'])

            x = np.arange(len(categories))
            width = 0.35

            ax2.bar(x - width/2, successful_counts, width, label='Successful', alpha=0.8)
            ax2.bar(x + width/2, total_counts, width, label='Total', alpha=0.8)

            ax2.set_title('Extraction Success Count', fontweight='bold')
            ax2.set_ylabel('Query Count')
            ax2.set_xlabel('Query Category')
            ax2.set_xticks(x)
            ax2.set_xticklabels(categories)
            ax2.legend()
            ax2.grid(True, alpha=0.3)

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Success Rate', 'Success/Total', 'Avg Time (s)', 'Target Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['success_rate']:.1%}",
                        f"{cat_stats['successful_count']}/{cat_stats['total_count']}",
                        f"{cat_stats['average_extraction_time']:.3f}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['success_rate']:.1%}",
                f"{overall_results['successful_count']}/{overall_results['total_count']}",
                '-',
                '✅' if overall_results.get('target_compliance', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Extraction Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Performance visualization
            ax4 = axes[1, 1]

            # Simple performance indicator
            overall_rate = overall_results['success_rate'] * 100
            colors = ['#d62728' if overall_rate < 80 else '#2ca02c']

            wedges, texts, autotexts = ax4.pie([overall_rate, 100-overall_rate],
                                               labels=['Successful', 'Failed'],
                                               autopct='%1.1f%%',
                                               colors=['#2ca02c', '#ffcccc'],
                                               startangle=90)

            ax4.set_title(f'Overall Extraction Success\n{overall_rate:.1f}% Success Rate', fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"extraction_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"📈 Extraction charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Extraction chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent extraction chart generation"""

    print("📈 OnCall.ai Extraction Chart Generator")

    chart_gen = ExtractionChartGenerator()

    try:
        stats = chart_gen.load_latest_extraction_statistics()
        chart_path = chart_gen.generate_extraction_charts(stats)

        print(f"\n✅ Extraction chart generation complete!")
        print(f"📈 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate extraction statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
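One detail worth noting: the summary table above indexes per-category fields (`average_extraction_time`, and optionally `meets_threshold`) that the overall extraction block does not carry, so each category entry in `extraction_statistics_*.json` is expected to look roughly like the sketch below. This is an illustrative shape with invented values, not output from a real run:

# Illustrative per-category entry read by the extraction summary table (values invented)
example_extraction_category = {
    "success_rate": 0.9,
    "successful_count": 9,
    "total_count": 10,
    "average_extraction_time": 1.250,  # seconds; rendered in the "Avg Time (s)" column
    "meets_threshold": True            # drives the "Target Met" checkmark
}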
evaluation/metric3_relevance_chart_generator.py
ADDED
@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Relevance Chart Generator
============================================

Generates retrieval relevance charts from saved statistics.
Shows cosine similarity analysis and threshold compliance.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class RelevanceChartGenerator:
    """Generate charts for retrieval relevance metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("📈 Initializing Relevance Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_relevance_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent relevance statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "relevance_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No relevance statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📊 Loading relevance statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_relevance_charts(self, stats: Dict[str, Any]) -> str:
        """Generate relevance analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Retrieval Relevance Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Average Relevance by Category
            ax1 = axes[0, 0]
            categories = []
            avg_relevances = []

            for category, cat_stats in category_results.items():
                if cat_stats['successful_retrievals'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_relevances.append(cat_stats['average_relevance'])

            categories.append('Overall')
            avg_relevances.append(overall_results['average_relevance'])

            bars = ax1.bar(categories, avg_relevances, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Average Relevance Score by Category', fontweight='bold')
            ax1.set_ylabel('Relevance Score (Cosine Similarity)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add threshold lines
            ax1.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
            ax1.axhline(y=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
            ax1.legend()

            # Add value labels
            for bar, relevance in zip(bars, avg_relevances):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{relevance:.3f}', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Relevance Distribution
            ax2 = axes[0, 1]

            # Collect all individual relevance scores
            all_scores = []
            category_labels = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_relevance_scores'):
                    all_scores.extend(cat_stats['individual_relevance_scores'])
                    category_labels.extend([category] * len(cat_stats['individual_relevance_scores']))

            if all_scores:
                # Create histogram
                ax2.hist(all_scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
                ax2.axvline(x=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
                ax2.axvline(x=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
                ax2.axvline(x=np.mean(all_scores), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(all_scores):.3f}')

                ax2.set_title('Relevance Score Distribution', fontweight='bold')
                ax2.set_xlabel('Relevance Score')
                ax2.set_ylabel('Frequency')
                ax2.legend()
                ax2.grid(True, alpha=0.3)
            else:
                ax2.text(0.5, 0.5, 'No relevance data available', ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Relevance Score Distribution', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Avg Relevance', 'Min/Max', 'Success/Total', 'Threshold Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_queries'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['average_relevance']:.3f}",
                        f"{cat_stats['min_relevance']:.3f}/{cat_stats['max_relevance']:.3f}",
                        f"{cat_stats['successful_retrievals']}/{cat_stats['total_queries']}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['average_relevance']:.3f}",
                f"{overall_results['min_relevance']:.3f}/{overall_results['max_relevance']:.3f}",
                f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
                '✅' if overall_results.get('target_compliance', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Relevance Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Category Comparison Box Plot
            ax4 = axes[1, 1]

            box_data = []
            box_labels = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_relevance_scores'):
                    box_data.append(cat_stats['individual_relevance_scores'])
                    box_labels.append(category.replace('_', ' ').title())

            if box_data:
                box_plot = ax4.boxplot(box_data, labels=box_labels, patch_artist=True)
                colors = ['#1f77b4', '#ff7f0e', '#d62728']
                for patch, color in zip(box_plot['boxes'], colors[:len(box_plot['boxes'])]):
                    patch.set_facecolor(color)
                    patch.set_alpha(0.7)

                ax4.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
                ax4.axhline(y=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
                ax4.set_title('Relevance Distribution by Category', fontweight='bold')
                ax4.set_ylabel('Relevance Score')
                ax4.legend()
                ax4.grid(True, alpha=0.3)
            else:
                ax4.text(0.5, 0.5, 'Insufficient data for box plot', ha='center', va='center', transform=ax4.transAxes)
                ax4.set_title('Relevance Distribution by Category', fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"relevance_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"📈 Relevance charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Relevance chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent relevance chart generation"""

    print("📈 OnCall.ai Relevance Chart Generator")

    chart_gen = RelevanceChartGenerator()

    try:
        stats = chart_gen.load_latest_relevance_statistics()
        chart_path = chart_gen.generate_relevance_charts(stats)

        print(f"\n✅ Relevance chart generation complete!")
        print(f"📈 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate relevance statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
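The relevance charts draw the 0.2 retrieval threshold and the 0.70 target as reference lines but do not report how many individual scores clear them. If that number is wanted alongside the plots, it can be derived from the same statistics file; a small helper sketch (not part of the generator, shown only as an illustration of how the stored scores can be reused):

def threshold_compliance(stats: dict, threshold: float = 0.2) -> float:
    """Fraction of individual relevance scores at or above `threshold`, across all categories."""
    scores = [
        score
        for cat_stats in stats["category_results"].values()
        for score in cat_stats.get("individual_relevance_scores", [])
    ]
    return sum(1 for score in scores if score >= threshold) / len(scores) if scores else 0.0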
evaluation/metric4_coverage_chart_generator.py
ADDED
@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Coverage Chart Generator
===========================================

Generates retrieval coverage charts from saved statistics.
Shows how well generated advice utilizes retrieved content.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class CoverageChartGenerator:
    """Generate charts for retrieval coverage metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("📈 Initializing Coverage Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_coverage_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent coverage statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "coverage_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No coverage statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📊 Loading coverage statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_coverage_charts(self, stats: Dict[str, Any]) -> str:
        """Generate coverage analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Retrieval Coverage Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Average Coverage by Category
            ax1 = axes[0, 0]
            categories = []
            avg_coverages = []

            for category, cat_stats in category_results.items():
                if cat_stats['successful_evaluations'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_coverages.append(cat_stats['average_coverage'] * 100)  # Convert to percentage

            categories.append('Overall')
            avg_coverages.append(overall_results['average_coverage'] * 100)

            bars = ax1.bar(categories, avg_coverages, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Average Coverage Score by Category', fontweight='bold')
            ax1.set_ylabel('Coverage Score (%)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add target line
            ax1.axhline(y=40, color='red', linestyle='--', alpha=0.7, label='40% Target')
            ax1.legend()

            # Add value labels
            for bar, coverage in zip(bars, avg_coverages):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                         f'{coverage:.1f}%', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Coverage Distribution
            ax2 = axes[0, 1]

            # Collect all individual coverage scores
            all_scores = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_coverage_scores'):
                    all_scores.extend([score * 100 for score in cat_stats['individual_coverage_scores']])

            if all_scores:
                # Create histogram
                ax2.hist(all_scores, bins=15, alpha=0.7, color='lightcoral', edgecolor='black')
                ax2.axvline(x=40, color='red', linestyle='--', alpha=0.7, label='40% Target')
                ax2.axvline(x=np.mean(all_scores), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(all_scores):.1f}%')

                ax2.set_title('Coverage Score Distribution', fontweight='bold')
                ax2.set_xlabel('Coverage Score (%)')
                ax2.set_ylabel('Frequency')
                ax2.legend()
                ax2.grid(True, alpha=0.3)
            else:
                ax2.text(0.5, 0.5, 'No coverage data available', ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Coverage Score Distribution', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Avg Coverage', 'Min/Max', 'Success/Total', 'Target Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_queries'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['average_coverage']:.3f}",
                        f"{cat_stats['min_coverage']:.3f}/{cat_stats['max_coverage']:.3f}",
                        f"{cat_stats['successful_evaluations']}/{cat_stats['total_queries']}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['average_coverage']:.3f}",
                f"{overall_results['min_coverage']:.3f}/{overall_results['max_coverage']:.3f}",
                f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
                '✅' if overall_results.get('meets_threshold', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Coverage Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Coverage Performance Radar/Gauge
            ax4 = axes[1, 1]

            # Create gauge-like visualization for overall coverage
            overall_coverage_pct = overall_results['average_coverage'] * 100

            # Pie chart as gauge
            sizes = [overall_coverage_pct, 100 - overall_coverage_pct]
            colors = ['#2ca02c' if overall_coverage_pct >= 40 else '#ff7f0e', '#f0f0f0']

            wedges, texts, autotexts = ax4.pie(sizes, labels=['Covered', 'Not Covered'],
                                               autopct='%1.1f%%',
                                               colors=colors,
                                               startangle=90,
                                               counterclock=False)

            # Add center text
            ax4.text(0, 0, f'{overall_coverage_pct:.1f}%\nCoverage',
                     ha='center', va='center', fontsize=14, fontweight='bold')

            ax4.set_title(f'Overall Coverage Performance\n{"✅ Target Met" if overall_coverage_pct >= 40 else "❌ Below Target"}',
                          fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"coverage_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"📈 Coverage charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Coverage chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent coverage chart generation"""

    print("📈 OnCall.ai Coverage Chart Generator")

    chart_gen = CoverageChartGenerator()

    try:
        stats = chart_gen.load_latest_coverage_statistics()
        chart_path = chart_gen.generate_coverage_charts(stats)

        print(f"\n✅ Coverage chart generation complete!")
        print(f"📈 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate coverage statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
evaluation/metric5_6_judge_evaluator_manual.md
ADDED
@@ -0,0 +1,303 @@
# Metric 5-6 LLM Judge Evaluator Manual

## Overview

The `metric5_6_llm_judge_evaluator.py` script is a multi-system evaluation tool that uses Llama3-70B as a third-party judge to assess medical advice quality across different AI systems. It supports both single-system evaluation and multi-system comparison, using a single LLM call for maximum consistency.

## Metrics Evaluated

**Metric 5: Clinical Actionability (臨床可操作性)**
- Scale: 1-10 (normalized to 0.0-1.0)
- Question: "Can healthcare providers immediately act on this advice?"
- Target: ≥7.0/10 for acceptable actionability

**Metric 6: Clinical Evidence Quality (臨床證據品質)**
- Scale: 1-10 (normalized to 0.0-1.0)
- Question: "Is the advice evidence-based and follows medical standards?"
- Target: ≥7.5/10 for acceptable evidence quality

## System Architecture

### Multi-System Support
The evaluator supports flexible system combinations (a minimal argument-handling sketch follows this list):
- **Single System**: `rag` or `direct`
- **Two-System Comparison**: `rag,direct`
- **Future Extension**: `rag,direct,claude,gpt4` (any combination)
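
The system list is passed as one comma-separated CLI argument. The sketch below shows one way that argument could be split into a list of system names; it is an illustration only, not a quote of the evaluator's actual `__main__` block, and the helper name `parse_systems_argument` is hypothetical.

```python
import sys
from typing import List


def parse_systems_argument(arg: str) -> List[str]:
    """Split a CLI argument like 'rag,direct' into a list of system names."""
    systems = [name.strip().lower() for name in arg.split(',') if name.strip()]
    if not systems:
        raise ValueError("At least one system name is required, e.g. 'rag' or 'rag,direct'")
    return systems


if __name__ == "__main__":
    # Example: python parse_systems_example.py rag,direct  ->  ['rag', 'direct']
    print(parse_systems_argument(sys.argv[1]))
```
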
### Judge LLM
- **Model**: Llama3-70B-Instruct via Hugging Face API
- **Strategy**: Single batch call for all evaluations (see the call sketch after this list)
- **Temperature**: 0.1 (low for consistent evaluation)
- **Max Tokens**: 2048 (sufficient for evaluation responses)
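
The judge is reached through the shared `llm_clients` wrapper. A trimmed sketch of the single batch call, mirroring how `evaluate_multiple_systems` in `metric5_6_llm_judge_evaluator.py` invokes it; the prompt string here is a placeholder:

```python
from llm_clients import llm_Llama3_70B_JudgeClient

judge = llm_Llama3_70B_JudgeClient()

# One prompt containing every query and every system response
# (see "Prompt Generation" below).
comparison_prompt = "..."

response = judge.batch_evaluate(comparison_prompt)

# The wrapper may return a dict with a 'content' field or a plain string.
response_text = response.get('content', '') if isinstance(response, dict) else str(response)
```
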
## Prerequisites

### 1. Environment Setup
```bash
# Ensure HF_TOKEN is set in your environment
export HF_TOKEN="your_huggingface_token"

# Or add to .env file
echo "HF_TOKEN=your_huggingface_token" >> .env
```

### 2. Required Data Files
Before running the judge evaluator, you must have medical outputs from your systems:

**For RAG System**:
```bash
python latency_evaluator.py single_test_query.txt
# Generates: results/medical_outputs_YYYYMMDD_HHMMSS.json
```

**For Direct LLM System**:
```bash
python direct_llm_evaluator.py single_test_query.txt
# Generates: results/medical_outputs_direct_YYYYMMDD_HHMMSS.json
```

## Usage

### Command Line Interface

#### Single System Evaluation
```bash
# Evaluate RAG system only
python metric5_6_llm_judge_evaluator.py rag

# Evaluate Direct LLM system only
python metric5_6_llm_judge_evaluator.py direct
```

#### Multi-System Comparison (Recommended)
```bash
# Compare RAG vs Direct systems
python metric5_6_llm_judge_evaluator.py rag,direct

# Future: Compare multiple systems
python metric5_6_llm_judge_evaluator.py rag,direct,claude
```

### Complete Workflow Example

```bash
# Step 1: Navigate to evaluation directory
cd /path/to/GenAI-OnCallAssistant/evaluation

# Step 2: Generate medical outputs from both systems
python latency_evaluator.py single_test_query.txt
python direct_llm_evaluator.py single_test_query.txt

# Step 3: Run comparative evaluation
python metric5_6_llm_judge_evaluator.py rag,direct
```

## Output Files

### Generated Files
- **Statistics**: `results/judge_evaluation_comparison_rag_vs_direct_YYYYMMDD_HHMMSS.json`
- **Detailed Results**: Stored in the evaluator's internal results array

### File Structure
```json
{
  "comparison_metadata": {
    "systems_compared": ["rag", "direct"],
    "comparison_type": "multi_system",
    "timestamp": "2025-08-04T22:00:00"
  },
  "category_results": {
    "diagnosis": {
      "average_actionability": 0.850,
      "average_evidence": 0.780,
      "query_count": 1,
      "actionability_target_met": true,
      "evidence_target_met": true
    }
  },
  "overall_results": {
    "average_actionability": 0.850,
    "average_evidence": 0.780,
    "successful_evaluations": 2,
    "total_queries": 2,
    "actionability_target_met": true,
    "evidence_target_met": true
  }
}
```
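
A short example of consuming this statistics file downstream (the filename is the example timestamp from above; any later tooling would normally pick the newest file instead of hard-coding one):

```python
import json

with open("results/judge_evaluation_comparison_rag_vs_direct_20250804_220000.json", encoding="utf-8") as f:
    stats = json.load(f)

overall = stats["overall_results"]
print(f"Actionability: {overall['average_actionability']:.3f} "
      f"({'met' if overall['actionability_target_met'] else 'not met'})")
print(f"Evidence:      {overall['average_evidence']:.3f} "
      f"({'met' if overall['evidence_target_met'] else 'not met'})")
```
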
## Evaluation Process

### 1. File Discovery
The evaluator automatically finds the latest medical output files (see the sketch after this list):
- **RAG**: `medical_outputs_*.json`
- **Direct**: `medical_outputs_direct_*.json`
- **Custom**: `medical_outputs_{system}_*.json`
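
A minimal sketch of the "latest file wins" selection used by `find_medical_outputs_for_systems` (simplified: the real evaluator narrows the RAG pattern to `medical_outputs_[0-9]*.json` so it does not match the direct files; the helper name here is illustrative):

```python
import glob
import os
from pathlib import Path


def find_latest_output(results_dir: Path, system: str) -> str:
    """Return the most recently modified medical outputs file for a system."""
    pattern = "medical_outputs_*.json" if system == "rag" else f"medical_outputs_{system}_*.json"
    candidates = glob.glob(str(results_dir / pattern))
    if not candidates:
        raise FileNotFoundError(f"No medical outputs files found for {system} system")
    # Newest file by modification time is treated as the current run.
    return max(candidates, key=os.path.getmtime)
```
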
### 2. Prompt Generation
For multi-system comparison, the evaluator creates a structured prompt:
```
You are a medical expert evaluating and comparing AI systems...

SYSTEM 1 (RAG): Uses medical guidelines + LLM for evidence-based advice
SYSTEM 2 (Direct): Uses LLM only without external guidelines

QUERY 1 (DIAGNOSIS):
Patient Query: 60-year-old patient with hypertension history...

SYSTEM 1 Response: For a 60-year-old patient with...
SYSTEM 2 Response: Based on the symptoms described...

RESPONSE FORMAT:
Query 1 System 1: Actionability=X, Evidence=Y
Query 1 System 2: Actionability=X, Evidence=Y
```

### 3. LLM Judge Evaluation
- **Single API Call**: All systems are evaluated in one request for consistency
- **Response Parsing**: Automatic extraction of numerical scores (see the sketch after this list)
- **Error Handling**: Graceful handling of parsing failures
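
Score lines come back in the exact `Query X System Y: Actionability=A, Evidence=B` format and are extracted with a regular expression. A trimmed sketch of that step, mirroring `parse_comparison_evaluation_response` (the function name `parse_score_lines` is illustrative; raw 1-10 scores are normalized to 0.0-1.0 as in the evaluator):

```python
import re
from typing import Dict, List

SCORE_LINE = re.compile(
    r'Query\s+(\d+)\s+System\s+(\d+):\s*Actionability\s*=\s*(\d+)\s*,\s*Evidence\s*=\s*(\d+)',
    re.IGNORECASE
)


def parse_score_lines(response_text: str) -> List[Dict]:
    """Extract per-query, per-system scores from the judge's response."""
    results = []
    for line in response_text.strip().split('\n'):
        match = SCORE_LINE.match(line.strip())
        if match:
            results.append({
                "query_index": int(match.group(1)) - 1,              # 0-based
                "system_index": int(match.group(2)) - 1,             # 0-based
                "actionability_score": int(match.group(3)) / 10.0,   # normalized 0.0-1.0
                "evidence_score": int(match.group(4)) / 10.0,        # normalized 0.0-1.0
            })
    return results
```
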
### 4. Results Analysis
- **System-Specific Statistics**: Individual performance metrics
- **Comparative Analysis**: Direct system-to-system comparison
- **Target Compliance**: Automatic threshold checking

## Expected Output

### Console Output Example
```
🧠 OnCall.ai LLM Judge Evaluator - Metrics 5-6 Multi-System Evaluation

🧪 Multi-System Comparison: RAG vs DIRECT
📊 Found rag outputs: results/medical_outputs_20250804_215917.json
📊 Found direct outputs: results/medical_outputs_direct_20250804_220000.json
📊 Comparing 2 systems with 1 queries each
🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)
⚡ Strategy: Single comparison call for maximum consistency

🧠 Multi-system comparison: rag, direct
📊 Evaluating 1 queries across 2 systems...
📝 Comparison prompt created (2150 characters)
🔄 Calling judge LLM for multi-system comparison...
✅ Judge LLM completed comparison evaluation in 45.3s
📄 Response length: 145 characters
📊 RAG: 1 evaluations parsed
📊 DIRECT: 1 evaluations parsed

📊 === LLM JUDGE EVALUATION SUMMARY ===
Systems Compared: RAG vs DIRECT
Overall Performance:
  Average Actionability: 0.850 (8.5/10)
  Average Evidence Quality: 0.780 (7.8/10)
  Actionability Target (≥7.0): ✅ Met
  Evidence Target (≥7.5): ✅ Met

System Breakdown:
  RAG: Actionability=0.900, Evidence=0.850 [1 queries]
  DIRECT: Actionability=0.800, Evidence=0.710 [1 queries]

✅ LLM judge evaluation complete!
📊 Statistics: results/judge_evaluation_comparison_rag_vs_direct_20250804_220000.json
⚡ Efficiency: 2 evaluations in 1 LLM call
```

## Key Features

### 1. Scientific Comparison Design
- **Single Judge Call**: All systems are evaluated simultaneously for consistency
- **Eliminates Temporal Bias**: Same judge, same context, same standards
- **Direct System Comparison**: Side-by-side evaluation format

### 2. Flexible Architecture
- **Backward Compatible**: Single-system evaluation is still supported
- **Future Extensible**: Easy to add new systems (`claude`, `gpt4`, etc.)
- **Modular Design**: Clean separation of concerns

### 3. Robust Error Handling
- **File Validation**: Automatic detection of missing input files
- **Query Count Verification**: Warns if systems have different query counts
- **Graceful Degradation**: Continues operation despite partial failures

### 4. Comprehensive Reporting
- **System-Specific Metrics**: Individual performance analysis
- **Comparative Statistics**: Direct system-to-system comparison
- **Target Compliance**: Automatic benchmark checking
- **Detailed Metadata**: Full traceability of evaluation parameters

## Troubleshooting

### Common Issues

#### 1. Missing Input Files
```
❌ No medical outputs files found for rag system
💡 Please run evaluators first:
   python latency_evaluator.py single_test_query.txt
```
**Solution**: Run the prerequisite evaluators to generate medical outputs.

#### 2. HF_TOKEN Not Set
```
❌ HF_TOKEN is missing from environment variables
```
**Solution**: Set your Hugging Face token in the environment or the `.env` file.

#### 3. Query Count Mismatch
```
⚠️ Warning: Systems have different query counts: {'rag': 3, 'direct': 1}
```
**Solution**: Ensure both systems processed the same input file.

#### 4. LLM API Timeout
```
❌ Multi-system evaluation failed: timeout
```
**Solution**: Check your internet connection and Hugging Face API status.

### Debug Tips

1. **Check File Existence**: Verify the medical output files in the `results/` directory
2. **Validate JSON Format**: Ensure input files are properly formatted
3. **Monitor API Usage**: Check Hugging Face account limits
4. **Review Logs**: Examine the detailed logging output for specific errors

## Future Extensions

### Phase 2: Generic Multi-System Framework
```bash
# Configuration-driven system comparison
python metric5_6_llm_judge_evaluator.py --config comparison_config.json
```

### Phase 3: Unlimited System Support
```bash
# Dynamic system registration
python metric5_6_llm_judge_evaluator.py med42,claude,gpt4,palm,llama2
```

### Integration with Chart Generators
```bash
# Generate comparison visualizations
python metric5_6_llm_judge_chart_generator.py rag,direct
```

## Best Practices

1. **Consistent Test Data**: Use the same query file for all systems
2. **Sequential Execution**: Complete data collection before evaluation
3. **Batch Processing**: Use multi-system mode for scientific comparison
4. **Result Verification**: Review the detailed statistics files for accuracy
5. **Performance Monitoring**: Track evaluation latency and API costs

## Scientific Validity

The multi-system comparison approach provides stronger scientific validity than separate evaluations:

- **Eliminates Judge Variability**: The same judge evaluates all systems
- **Reduces Temporal Effects**: All evaluations happen in a single time window
- **Ensures Consistent Standards**: Identical evaluation criteria are applied
- **Enables Direct Comparison**: Side-by-side system assessment
- **Maximizes Efficiency**: A single API call instead of multiple separate calls

This design makes the evaluation results more reliable for research publications and system optimization decisions.
evaluation/metric5_6_llm_judge_chart_generator.py
ADDED
@@ -0,0 +1,430 @@
#!/usr/bin/env python3
"""
OnCall.ai System - LLM Judge Chart Generator (Metrics 5-6)
==========================================================

Generates comprehensive comparison charts for LLM judge evaluation results.
Supports both single-system and multi-system visualization with professional layouts.

Metrics visualized:
5. Clinical Actionability (臨床可操作性) - 1-10 scale
6. Clinical Evidence Quality (臨床證據品質) - 1-10 scale

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any, Tuple
from datetime import datetime
from pathlib import Path
import glob
import numpy as np

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.patches import Rectangle


class LLMJudgeChartGenerator:
    """Generate professional comparison charts for LLM judge evaluation results"""

    def __init__(self):
        """Initialize chart generator with professional styling"""
        print("📈 Initializing LLM Judge Chart Generator...")

        # Set up professional chart style
        plt.style.use('default')
        sns.set_palette("husl")

        # Professional color scheme for medical evaluation
        self.colors = {
            'rag': '#2E8B57',            # Sea Green - represents evidence-based
            'direct': '#4682B4',         # Steel Blue - represents direct approach
            'claude': '#9370DB',         # Medium Purple - future extension
            'gpt4': '#DC143C',           # Crimson - future extension
            'actionability': '#FF6B6B',  # Coral Red
            'evidence': '#4ECDC4',       # Turquoise
            'target_line': '#FF4444',    # Red for target thresholds
            'grid': '#E0E0E0'            # Light gray for grid
        }

        print("✅ Chart Generator ready with professional medical styling")

    def load_latest_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """
        Load the most recent judge evaluation statistics file

        Args:
            results_dir: Directory containing statistics files
        """
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        # Find latest comparison statistics file
        pattern = str(results_dir / "judge_evaluation_comparison_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No judge evaluation comparison files found in {results_dir}")

        # Get the most recent file
        latest_file = max(stat_files, key=os.path.getmtime)

        print(f"📊 Loading statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def generate_comparison_charts(self, stats: Dict[str, Any], save_path: str = None) -> str:
        """
        Generate comprehensive 4-panel comparison visualization

        Creates professional charts showing:
        1. System comparison radar chart
        2. Grouped bar chart comparison
        3. Actionability vs Evidence scatter plot
        4. Category-wise heatmap
        """
        try:
            # Create figure with subplots
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle(
                'Medical AI Systems Comparison - Clinical Quality Assessment\n'
                'Actionability (1-10): Can healthcare providers act immediately? | '
                'Evidence Quality (1-10): Is advice evidence-based?',
                fontsize=14, fontweight='bold', y=0.95
            )

            # Extract comparison metadata
            comparison_meta = stats.get('comparison_metadata', {})
            systems = comparison_meta.get('systems_compared', ['rag', 'direct'])

            overall_results = stats['overall_results']
            category_results = stats['category_results']

            # Chart 1: System Comparison Radar Chart
            self._create_radar_chart(axes[0, 0], stats, systems)

            # Chart 2: Grouped Bar Chart Comparison
            self._create_grouped_bar_chart(axes[0, 1], stats, systems)

            # Chart 3: Actionability vs Evidence Scatter Plot
            self._create_scatter_plot(axes[1, 0], stats, systems)

            # Chart 4: Category-wise Performance Heatmap
            self._create_heatmap(axes[1, 1], stats, systems)

            # Add method annotation at bottom
            method_text = (
                f"Evaluation: Llama3-70B judge | Targets: Actionability ≥7.0, Evidence ≥7.5 | "
                f"Systems: {', '.join([s.upper() for s in systems])} | "
                f"Queries: {overall_results.get('total_queries', 'N/A')}"
            )
            fig.text(0.5, 0.02, method_text, ha='center', fontsize=10,
                     style='italic', color='gray')

            # Adjust layout
            plt.tight_layout()
            plt.subplots_adjust(top=0.88, bottom=0.08)

            # Save the chart
            if save_path is None:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                systems_str = "_vs_".join(systems)
                save_path = f"judge_comparison_charts_{systems_str}_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            full_path = results_dir / save_path

            plt.savefig(full_path, dpi=300, bbox_inches='tight')
            plt.show()

            print(f"📊 Comparison charts saved to: {full_path}")
            return str(full_path)

        except Exception as e:
            print(f"❌ Chart generation failed: {e}")
            raise

    def _create_radar_chart(self, ax, stats: Dict, systems: List[str]):
        """Create radar chart for multi-dimensional system comparison"""
        ax.set_title('Multi-Dimensional System Comparison', fontweight='bold', pad=20)

        # Prepare data for radar chart using real system-specific data
        categories = ['Overall Actionability', 'Overall Evidence', 'Diagnosis', 'Treatment', 'Mixed']

        # Extract real system-specific metrics
        detailed_results = stats.get('detailed_system_results', {})
        system_data = {}

        for system in systems:
            if system in detailed_results:
                system_info = detailed_results[system]
                system_results = system_info['results']

                # Calculate category-specific performance
                category_performance = {}
                for result in system_results:
                    category = result.get('category', 'unknown').lower()
                    if category not in category_performance:
                        category_performance[category] = {'actionability': [], 'evidence': []}
                    category_performance[category]['actionability'].append(result['actionability_score'])
                    category_performance[category]['evidence'].append(result['evidence_score'])

                # Build radar chart data
                system_scores = [
                    system_info['avg_actionability'],  # Overall Actionability
                    system_info['avg_evidence'],       # Overall Evidence
                    # Category-specific scores (average of actionability and evidence)
                    (sum(category_performance.get('diagnosis', {}).get('actionability', [0])) /
                     len(category_performance.get('diagnosis', {}).get('actionability', [1])) +
                     sum(category_performance.get('diagnosis', {}).get('evidence', [0])) /
                     len(category_performance.get('diagnosis', {}).get('evidence', [1]))) / 2 if 'diagnosis' in category_performance else 0.5,

                    (sum(category_performance.get('treatment', {}).get('actionability', [0])) /
                     len(category_performance.get('treatment', {}).get('actionability', [1])) +
                     sum(category_performance.get('treatment', {}).get('evidence', [0])) /
                     len(category_performance.get('treatment', {}).get('evidence', [1]))) / 2 if 'treatment' in category_performance else 0.5,

                    (sum(category_performance.get('mixed', {}).get('actionability', [0])) /
                     len(category_performance.get('mixed', {}).get('actionability', [1])) +
                     sum(category_performance.get('mixed', {}).get('evidence', [0])) /
                     len(category_performance.get('mixed', {}).get('evidence', [1]))) / 2 if 'mixed' in category_performance else 0.5
                ]
                system_data[system] = system_scores
            else:
                # Fallback to overall stats if detailed results not available
                overall_results = stats['overall_results']
                system_data[system] = [
                    overall_results['average_actionability'],
                    overall_results['average_evidence'],
                    0.7, 0.6, 0.5  # Placeholder for missing category data
                ]

        # Create radar chart
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        angles += angles[:1]  # Complete the circle

        for system in systems:
            values = system_data[system] + [system_data[system][0]]  # Complete the circle
            ax.plot(angles, values, 'o-', linewidth=2,
                    label=f'{system.upper()} System', color=self.colors.get(system, 'gray'))
            ax.fill(angles, values, alpha=0.1, color=self.colors.get(system, 'gray'))

        # Customize radar chart
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories, fontsize=9)
        ax.set_ylim(0, 1)
        ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
        ax.set_yticklabels(['2.0', '4.0', '6.0', '8.0', '10.0'])
        ax.grid(True, alpha=0.3)
        ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.0))

        # Add target threshold circle
        target_circle = [0.7] * (len(categories) + 1)  # 7.0 threshold
        ax.plot(angles, target_circle, '--', color=self.colors['target_line'],
                alpha=0.7, label='Target (7.0)')

    def _create_grouped_bar_chart(self, ax, stats: Dict, systems: List[str]):
        """Create grouped bar chart for direct metric comparison"""
        ax.set_title('Direct Metric Comparison', fontweight='bold', pad=20)

        # Prepare data using real system-specific metrics
        metrics = ['Actionability', 'Evidence Quality']
        detailed_results = stats.get('detailed_system_results', {})

        # Extract real system-specific data
        system_scores = {}
        for system in systems:
            if system in detailed_results:
                system_info = detailed_results[system]
                system_scores[system] = [
                    system_info['avg_actionability'],
                    system_info['avg_evidence']
                ]
            else:
                # Fallback to overall results
                overall_results = stats['overall_results']
                system_scores[system] = [
                    overall_results['average_actionability'],
                    overall_results['average_evidence']
                ]

        # Create grouped bar chart
        x = np.arange(len(metrics))
        width = 0.35 if len(systems) == 2 else 0.25

        for i, system in enumerate(systems):
            offset = (i - len(systems)/2 + 0.5) * width
            bars = ax.bar(x + offset, system_scores[system], width,
                          label=f'{system.upper()}', color=self.colors.get(system, 'gray'),
                          alpha=0.8)

            # Add value labels on bars
            for bar, value in zip(bars, system_scores[system]):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                        f'{value:.3f}', ha='center', va='bottom', fontweight='bold')

        # Add target threshold lines
        ax.axhline(y=0.7, color=self.colors['target_line'], linestyle='--',
                   alpha=0.7, label='Actionability Target (7.0)')
        ax.axhline(y=0.75, color=self.colors['target_line'], linestyle=':',
                   alpha=0.7, label='Evidence Target (7.5)')

        # Customize chart
        ax.set_xlabel('Evaluation Metrics')
        ax.set_ylabel('Score (0-1 scale)')
        ax.set_title('System Performance Comparison')
        ax.set_xticks(x)
        ax.set_xticklabels(metrics)
        ax.legend(loc='upper left')
        ax.grid(True, alpha=0.3, axis='y')
        ax.set_ylim(0, 1.0)

    def _create_scatter_plot(self, ax, stats: Dict, systems: List[str]):
        """Create scatter plot for actionability vs evidence quality analysis"""
        ax.set_title('Actionability vs Evidence Quality Analysis', fontweight='bold', pad=20)

        # Extract real query-level data from detailed results
        detailed_results = stats.get('detailed_system_results', {})

        for system in systems:
            if system in detailed_results:
                system_results = detailed_results[system]['results']

                # Extract real actionability and evidence scores for each query
                actionability_scores = [r['actionability_score'] for r in system_results]
                evidence_scores = [r['evidence_score'] for r in system_results]

                ax.scatter(actionability_scores, evidence_scores,
                           label=f'{system.upper()}', color=self.colors.get(system, 'gray'),
                           alpha=0.7, s=100, edgecolors='white', linewidth=1)
            else:
                # Fallback: create single point from overall averages
                overall_results = stats['overall_results']
                ax.scatter([overall_results['average_actionability']],
                           [overall_results['average_evidence']],
                           label=f'{system.upper()}', color=self.colors.get(system, 'gray'),
                           alpha=0.7, s=100, edgecolors='white', linewidth=1)

        # Add target threshold lines
        ax.axvline(x=0.7, color=self.colors['target_line'], linestyle='--',
                   alpha=0.7, label='Actionability Target')
        ax.axhline(y=0.75, color=self.colors['target_line'], linestyle='--',
                   alpha=0.7, label='Evidence Target')

        # Add target zone
        target_rect = Rectangle((0.7, 0.75), 0.3, 0.25, linewidth=1,
                                edgecolor=self.colors['target_line'], facecolor='green',
                                alpha=0.1, label='Target Zone')
        ax.add_patch(target_rect)

        # Customize chart
        ax.set_xlabel('Clinical Actionability (0-1 scale)')
        ax.set_ylabel('Clinical Evidence Quality (0-1 scale)')
        ax.legend(loc='lower right')
        ax.grid(True, alpha=0.3)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)

    def _create_heatmap(self, ax, stats: Dict, systems: List[str]):
        """Create heatmap for category-wise performance matrix"""
        ax.set_title('Category-wise Performance Matrix', fontweight='bold', pad=20)

        # Prepare data
        categories = ['Diagnosis', 'Treatment', 'Mixed']
        metrics = ['Actionability', 'Evidence']
        category_results = stats['category_results']

        # Create data matrix
        data_matrix = []
        row_labels = []

        for system in systems:
            for metric in metrics:
                row_data = []
                for category in categories:
                    cat_key = category.lower()
                    if cat_key in category_results and category_results[cat_key]['query_count'] > 0:
                        if metric == 'Actionability':
                            value = category_results[cat_key]['average_actionability']
                        else:
                            value = category_results[cat_key]['average_evidence']
                    else:
                        value = 0.5  # Placeholder for missing data
                    row_data.append(value)

                data_matrix.append(row_data)
                row_labels.append(f'{system.upper()}\n{metric}')

        # Create heatmap
        im = ax.imshow(data_matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)

        # Set ticks and labels
        ax.set_xticks(np.arange(len(categories)))
        ax.set_yticks(np.arange(len(row_labels)))
        ax.set_xticklabels(categories)
        ax.set_yticklabels(row_labels, fontsize=9)

        # Add text annotations
        for i in range(len(row_labels)):
            for j in range(len(categories)):
                text = ax.text(j, i, f'{data_matrix[i][j]:.3f}',
                               ha='center', va='center', fontweight='bold',
                               color='white' if data_matrix[i][j] < 0.5 else 'black')

        # Add colorbar
        cbar = plt.colorbar(im, ax=ax, shrink=0.6)
        cbar.set_label('Performance Score (0-1)', rotation=270, labelpad=15)

        ax.set_xlabel('Query Categories')
        ax.set_ylabel('System × Metric')


# Independent execution interface
if __name__ == "__main__":
    """Independent chart generation interface"""

    print("📊 OnCall.ai LLM Judge Chart Generator - Metrics 5-6 Visualization")

    # Initialize generator
    generator = LLMJudgeChartGenerator()

    try:
        # Load latest statistics
        stats = generator.load_latest_statistics()

        print(f"📈 Generating comparison charts...")

        # Generate comprehensive comparison charts
        chart_path = generator.generate_comparison_charts(stats)

        # Print summary
        comparison_meta = stats.get('comparison_metadata', {})
        systems = comparison_meta.get('systems_compared', ['rag', 'direct'])
        overall_results = stats['overall_results']

        print(f"\n📊 === CHART GENERATION SUMMARY ===")
        print(f"Systems Visualized: {' vs '.join([s.upper() for s in systems])}")
        print(f"Overall Actionability: {overall_results['average_actionability']:.3f}")
        print(f"Overall Evidence Quality: {overall_results['average_evidence']:.3f}")
        print(f"Total Queries: {overall_results['total_queries']}")
        print(f"Chart Components: Radar Chart, Bar Chart, Scatter Plot, Heatmap")

        print(f"\n✅ Comprehensive visualization complete!")
        print(f"📊 Charts saved to: {chart_path}")
        print(f"💡 Tip: Charts optimized for research presentations and publications")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print(f"💡 Please run judge evaluation first:")
        print("   python metric5_6_llm_judge_evaluator.py rag,direct")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
evaluation/metric5_6_llm_judge_evaluator.py
ADDED
@@ -0,0 +1,643 @@
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OnCall.ai System - LLM Judge Evaluator (Metrics 5-6)
|
| 4 |
+
====================================================
|
| 5 |
+
|
| 6 |
+
Uses Llama3-70B as third-party judge to evaluate medical advice quality.
|
| 7 |
+
Batch evaluation strategy: 1 call evaluates all queries for maximum efficiency.
|
| 8 |
+
|
| 9 |
+
Metrics evaluated:
|
| 10 |
+
5. Clinical Actionability (臨床可操作性)
|
| 11 |
+
6. Clinical Evidence Quality (臨床證據品質)
|
| 12 |
+
|
| 13 |
+
EVALUATION RUBRICS:
|
| 14 |
+
|
| 15 |
+
Metric 5: Clinical Actionability (1-10 scale)
|
| 16 |
+
1-2 points: Almost no actionable advice; extremely abstract or empty responses.
|
| 17 |
+
3-4 points: Provides some directional suggestions but too vague, lacks clear steps.
|
| 18 |
+
5-6 points: Offers basic executable steps but lacks details or insufficient explanation for key aspects.
|
| 19 |
+
7-8 points: Clear and complete steps that clinicians can follow, with occasional gaps needing supplementation.
|
| 20 |
+
9-10 points: Extremely actionable with precise, step-by-step executable guidance; can be used "as-is" immediately.
|
| 21 |
+
|
| 22 |
+
Metric 6: Clinical Evidence Quality (1-10 scale)
|
| 23 |
+
1-2 points: Almost no evidence support; cites completely irrelevant or unreliable sources.
|
| 24 |
+
3-4 points: References lower quality literature or guidelines, or sources lack authority.
|
| 25 |
+
5-6 points: Uses general quality literature/guidelines but lacks depth or currency.
|
| 26 |
+
7-8 points: References reliable, authoritative sources (renowned journals or authoritative guidelines) with accurate explanations.
|
| 27 |
+
9-10 points: Rich and high-quality evidence sources (systematic reviews, RCTs, etc.) combined with latest research; enhances recommendation credibility.
|
| 28 |
+
|
| 29 |
+
Author: YanBo Chen
|
| 30 |
+
Date: 2025-08-04
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
import json
|
| 34 |
+
import os
|
| 35 |
+
import sys
|
| 36 |
+
import time
|
| 37 |
+
from typing import Dict, List, Any, Tuple
|
| 38 |
+
from datetime import datetime
|
| 39 |
+
from pathlib import Path
|
| 40 |
+
import glob
|
| 41 |
+
import re
|
| 42 |
+
|
| 43 |
+
# Evaluation Rubrics as programmable constants
|
| 44 |
+
ACTIONABILITY_RUBRIC = {
|
| 45 |
+
(1, 2): "Almost no actionable advice; extremely abstract or empty responses.",
|
| 46 |
+
(3, 4): "Provides some directional suggestions but too vague, lacks clear steps.",
|
| 47 |
+
(5, 6): "Offers basic executable steps but lacks details or insufficient explanation for key aspects.",
|
| 48 |
+
(7, 8): "Clear and complete steps that clinicians can follow, with occasional gaps needing supplementation.",
|
| 49 |
+
(9, 10): "Extremely actionable with precise, step-by-step executable guidance; can be used 'as-is' immediately."
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
EVIDENCE_RUBRIC = {
|
| 53 |
+
(1, 2): "Almost no evidence support; cites completely irrelevant or unreliable sources.",
|
| 54 |
+
(3, 4): "References lower quality literature or guidelines, or sources lack authority.",
|
| 55 |
+
(5, 6): "Uses general quality literature/guidelines but lacks depth or currency.",
|
| 56 |
+
(7, 8): "References reliable, authoritative sources (renowned journals or authoritative guidelines) with accurate explanations.",
|
| 57 |
+
(9, 10): "Rich and high-quality evidence sources (systematic reviews, RCTs, etc.) combined with latest research; enhances recommendation credibility."
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
def print_evaluation_rubrics():
|
| 61 |
+
"""Print detailed evaluation rubrics for reference"""
|
| 62 |
+
print("=" * 60)
|
| 63 |
+
print("CLINICAL EVALUATION RUBRICS")
|
| 64 |
+
print("=" * 60)
|
| 65 |
+
|
| 66 |
+
print("\n🎯 METRIC 5: Clinical Actionability (1-10 scale)")
|
| 67 |
+
print("-" * 50)
|
| 68 |
+
for score_range, description in ACTIONABILITY_RUBRIC.items():
|
| 69 |
+
print(f"{score_range[0]}–{score_range[1]} points: {description}")
|
| 70 |
+
|
| 71 |
+
print("\n📚 METRIC 6: Clinical Evidence Quality (1-10 scale)")
|
| 72 |
+
print("-" * 50)
|
| 73 |
+
for score_range, description in EVIDENCE_RUBRIC.items():
|
| 74 |
+
print(f"{score_range[0]}–{score_range[1]} points: {description}")
|
| 75 |
+
|
| 76 |
+
print("\n" + "=" * 60)
|
| 77 |
+
print("TARGET THRESHOLDS:")
|
| 78 |
+
print("• Actionability: ≥7.0 (Acceptable clinical utility)")
|
| 79 |
+
print("• Evidence Quality: ≥7.5 (Reliable evidence support)")
|
| 80 |
+
print("=" * 60)
|
| 81 |
+
|
| 82 |
+
def get_rubric_description(score: int, metric_type: str) -> str:
|
| 83 |
+
"""Get rubric description for a given score and metric type"""
|
| 84 |
+
rubric = ACTIONABILITY_RUBRIC if metric_type == "actionability" else EVIDENCE_RUBRIC
|
| 85 |
+
|
| 86 |
+
for score_range, description in rubric.items():
|
| 87 |
+
if score_range[0] <= score <= score_range[1]:
|
| 88 |
+
return description
|
| 89 |
+
|
| 90 |
+
return "Score out of valid range (1-10)"
|
| 91 |
+
|
| 92 |
+
# Add project path
|
| 93 |
+
current_dir = Path(__file__).parent
|
| 94 |
+
project_root = current_dir.parent
|
| 95 |
+
src_dir = project_root / "src"
|
| 96 |
+
sys.path.insert(0, str(src_dir))
|
| 97 |
+
|
| 98 |
+
# Import LLM client for judge evaluation
|
| 99 |
+
try:
|
| 100 |
+
from llm_clients import llm_Llama3_70B_JudgeClient
|
| 101 |
+
except ImportError as e:
|
| 102 |
+
print(f"❌ Import failed: {e}")
|
| 103 |
+
print("Please ensure running from project root directory")
|
| 104 |
+
sys.exit(1)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class LLMJudgeEvaluator:
|
| 108 |
+
"""LLM judge evaluator using batch evaluation strategy"""
|
| 109 |
+
|
| 110 |
+
def __init__(self):
|
| 111 |
+
"""Initialize judge LLM client"""
|
| 112 |
+
print("🔧 Initializing LLM Judge Evaluator...")
|
| 113 |
+
|
| 114 |
+
# Initialize Llama3-70B as judge LLM
|
| 115 |
+
self.judge_llm = llm_Llama3_70B_JudgeClient()
|
| 116 |
+
|
| 117 |
+
self.evaluation_results = []
|
| 118 |
+
|
| 119 |
+
print("✅ LLM Judge Evaluator initialization complete")
|
| 120 |
+
|
| 121 |
+
def load_medical_outputs(self, filepath: str) -> List[Dict[str, Any]]:
|
| 122 |
+
"""Load medical outputs from file"""
|
| 123 |
+
print(f"📁 Loading medical outputs from: {filepath}")
|
| 124 |
+
|
| 125 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 126 |
+
data = json.load(f)
|
| 127 |
+
|
| 128 |
+
medical_outputs = data.get('medical_outputs', [])
|
| 129 |
+
print(f"📋 Loaded {len(medical_outputs)} medical outputs")
|
| 130 |
+
|
| 131 |
+
return medical_outputs
|
| 132 |
+
|
| 133 |
+
def find_medical_outputs_for_systems(self, systems: List[str]) -> Dict[str, str]:
|
| 134 |
+
"""Find medical outputs files for multiple systems"""
|
| 135 |
+
results_dir = Path(__file__).parent / "results"
|
| 136 |
+
system_files = {}
|
| 137 |
+
|
| 138 |
+
for system in systems:
|
| 139 |
+
if system == "rag":
|
| 140 |
+
# Use more specific pattern to exclude direct files
|
| 141 |
+
pattern = str(results_dir / "medical_outputs_[0-9]*.json")
|
| 142 |
+
elif system == "direct":
|
| 143 |
+
pattern = str(results_dir / "medical_outputs_direct_*.json")
|
| 144 |
+
else:
|
| 145 |
+
# Future extension: support other systems
|
| 146 |
+
pattern = str(results_dir / f"medical_outputs_{system}_*.json")
|
| 147 |
+
|
| 148 |
+
print(f"🔍 Searching for {system} with pattern: {pattern}")
|
| 149 |
+
output_files = glob.glob(pattern)
|
| 150 |
+
print(f"🔍 Found files for {system}: {output_files}")
|
| 151 |
+
|
| 152 |
+
if not output_files:
|
| 153 |
+
raise FileNotFoundError(f"No medical outputs files found for {system} system")
|
| 154 |
+
|
| 155 |
+
latest_file = max(output_files, key=os.path.getmtime)
|
| 156 |
+
system_files[system] = latest_file
|
| 157 |
+
print(f"📊 Found {system} outputs: {latest_file}")
|
| 158 |
+
|
| 159 |
+
return system_files
|
| 160 |
+
|
| 161 |
+
def create_comparison_evaluation_prompt(self, systems_outputs: Dict[str, List[Dict]]) -> str:
|
| 162 |
+
"""
|
| 163 |
+
Create comparison evaluation prompt for multiple systems
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
systems_outputs: Dict mapping system names to their medical outputs
|
| 167 |
+
"""
|
| 168 |
+
system_names = list(systems_outputs.keys())
|
| 169 |
+
|
| 170 |
+
prompt_parts = [
|
| 171 |
+
"You are a medical expert evaluating and comparing AI systems for clinical advice quality.",
|
| 172 |
+
f"Please evaluate {len(system_names)} different systems using the detailed rubrics below:",
|
| 173 |
+
"",
|
| 174 |
+
"EVALUATION RUBRICS:",
|
| 175 |
+
"",
|
| 176 |
+
"METRIC 1: Clinical Actionability (1-10 scale)",
|
| 177 |
+
"Question: Can healthcare providers immediately act on this advice?",
|
| 178 |
+
"1-2 points: Almost no actionable advice; extremely abstract or empty responses.",
|
| 179 |
+
"3-4 points: Provides directional suggestions but too vague, lacks clear steps.",
|
| 180 |
+
"5-6 points: Offers basic executable steps but lacks details for key aspects.",
|
| 181 |
+
"7-8 points: Clear and complete steps that clinicians can follow with occasional gaps.",
|
| 182 |
+
"9-10 points: Extremely actionable with precise, step-by-step executable guidance.",
|
| 183 |
+
"",
|
| 184 |
+
"METRIC 2: Clinical Evidence Quality (1-10 scale)",
|
| 185 |
+
"Question: Is the advice evidence-based and follows medical standards?",
|
| 186 |
+
"1-2 points: Almost no evidence support; cites irrelevant or unreliable sources.",
|
| 187 |
+
"3-4 points: References lower quality literature or sources lack authority.",
|
| 188 |
+
"5-6 points: Uses general quality literature/guidelines but lacks depth or currency.",
|
| 189 |
+
"7-8 points: References reliable, authoritative sources with accurate explanations.",
|
| 190 |
+
"9-10 points: Rich, high-quality evidence sources combined with latest research.",
|
| 191 |
+
"",
|
| 192 |
+
"TARGET THRESHOLDS: Actionability ≥7.0, Evidence Quality ≥7.5",
|
| 193 |
+
""
|
| 194 |
+
]
|
| 195 |
+
|
| 196 |
+
# Add system descriptions
|
| 197 |
+
for i, system in enumerate(system_names, 1):
|
| 198 |
+
if system == "rag":
|
| 199 |
+
prompt_parts.append(f"SYSTEM {i} (RAG): Uses medical guidelines + LLM for evidence-based advice")
|
| 200 |
+
elif system == "direct":
|
| 201 |
+
prompt_parts.append(f"SYSTEM {i} (Direct): Uses LLM only without external guidelines")
|
| 202 |
+
else:
|
| 203 |
+
prompt_parts.append(f"SYSTEM {i} ({system.upper()}): {system} medical AI system")
|
| 204 |
+
|
| 205 |
+
prompt_parts.extend([
|
| 206 |
+
"",
|
| 207 |
+
"EVALUATION CRITERIA:",
|
| 208 |
+
"1. Clinical Actionability (1-10): Can healthcare providers immediately act on this advice?",
|
| 209 |
+
"2. Clinical Evidence Quality (1-10): Is the advice evidence-based and follows medical standards?",
|
| 210 |
+
"",
|
| 211 |
+
"QUERIES TO EVALUATE:",
|
| 212 |
+
""
|
| 213 |
+
])
|
| 214 |
+
|
| 215 |
+
# Get all queries (assuming all systems processed same queries)
|
| 216 |
+
first_system = system_names[0]
|
| 217 |
+
queries = systems_outputs[first_system]
|
| 218 |
+
|
| 219 |
+
# Add each query with all system responses
|
| 220 |
+
for i, query_data in enumerate(queries, 1):
|
| 221 |
+
query = query_data.get('query', '')
|
| 222 |
+
category = query_data.get('category', 'unknown')
|
| 223 |
+
|
| 224 |
+
prompt_parts.extend([
|
| 225 |
+
f"=== QUERY {i} ({category.upper()}) ===",
|
| 226 |
+
f"Patient Query: {query}",
|
| 227 |
+
""
|
| 228 |
+
])
|
| 229 |
+
|
| 230 |
+
# Add each system's response
|
| 231 |
+
for j, system in enumerate(system_names, 1):
|
| 232 |
+
system_query = systems_outputs[system][i-1] # Get corresponding query from this system
|
| 233 |
+
advice = system_query.get('medical_advice', '')
|
| 234 |
+
|
| 235 |
+
prompt_parts.extend([
|
| 236 |
+
f"SYSTEM {j} Response: {advice}",
|
| 237 |
+
""
|
| 238 |
+
])
|
| 239 |
+
|
| 240 |
+
prompt_parts.extend([
|
| 241 |
+
"RESPONSE FORMAT (provide exactly this format):",
|
| 242 |
+
""
|
| 243 |
+
])
|
| 244 |
+
|
| 245 |
+
# Add response format template
|
| 246 |
+
for i in range(1, len(queries) + 1):
|
| 247 |
+
for j, system in enumerate(system_names, 1):
|
| 248 |
+
prompt_parts.append(f"Query {i} System {j}: Actionability=X, Evidence=Y")
|
| 249 |
+
|
| 250 |
+
prompt_parts.extend([
|
| 251 |
+
"",
|
| 252 |
+
"Replace X and Y with numeric scores 1-10.",
|
| 253 |
+
"Provide only the scores in the exact format above.",
|
| 254 |
+
f"Note: System 1={system_names[0]}, System 2={system_names[1] if len(system_names) > 1 else 'N/A'}"
|
| 255 |
+
])
|
| 256 |
+
|
| 257 |
+
return "\n".join(prompt_parts)
|
| 258 |
+
|
    def parse_comparison_evaluation_response(self, response: str, systems_outputs: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
        """Parse comparison evaluation response into results by system"""
        results_by_system = {}
        system_names = list(systems_outputs.keys())

        # Initialize results for each system
        for system in system_names:
            results_by_system[system] = []

        lines = response.strip().split('\n')

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Parse format: "Query X System Y: Actionability=A, Evidence=B"
            match = re.match(r'Query\s+(\d+)\s+System\s+(\d+):\s*Actionability\s*=\s*(\d+)\s*,\s*Evidence\s*=\s*(\d+)', line, re.IGNORECASE)

            if match:
                query_num = int(match.group(1)) - 1  # 0-based index
                system_num = int(match.group(2)) - 1  # 0-based index
                actionability_score = int(match.group(3))
                evidence_score = int(match.group(4))

                if system_num < len(system_names) and query_num < len(systems_outputs[system_names[system_num]]):
                    system_name = system_names[system_num]
                    output = systems_outputs[system_name][query_num]

                    result = {
                        "query": output.get('query', ''),
                        "category": output.get('category', 'unknown'),
                        "system_type": system_name,
                        "medical_advice": output.get('medical_advice', ''),

                        # Metric 5: Clinical Actionability
                        "actionability_score": actionability_score / 10.0,
                        "actionability_raw": actionability_score,

                        # Metric 6: Clinical Evidence Quality
                        "evidence_score": evidence_score / 10.0,
                        "evidence_raw": evidence_score,

                        "evaluation_success": True,
                        "timestamp": datetime.now().isoformat()
                    }

                    results_by_system[system_name].append(result)

        return results_by_system
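    # Illustrative example of the parsing above (the judge's exact wording may vary):
    # a response line such as
    #   "Query 2 System 1: Actionability=8, Evidence=7"
    # matches the regex with groups ('2', '1', '8', '7'), so the result is attached to
    # query index 1 of the first system, with actionability_score=0.8 and evidence_score=0.7
    # after the /10.0 normalization used above.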
    def evaluate_multiple_systems(self, systems_outputs: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
        """
        Evaluate multiple systems using a single LLM call for comparison

        Args:
            systems_outputs: Dict mapping system names to their medical outputs
        """
        system_names = list(systems_outputs.keys())
        total_queries = len(systems_outputs[system_names[0]])

        print(f"🧠 Multi-system comparison: {', '.join(system_names)}")
        print(f"📊 Evaluating {total_queries} queries across {len(system_names)} systems...")

        try:
            # Create comparison evaluation prompt
            comparison_prompt = self.create_comparison_evaluation_prompt(systems_outputs)

            print(f"📝 Comparison prompt created ({len(comparison_prompt)} characters)")
            print(f"🔄 Calling judge LLM for multi-system comparison...")

            # Single LLM call for all systems comparison
            eval_start = time.time()
            response = self.judge_llm.batch_evaluate(comparison_prompt)
            eval_time = time.time() - eval_start

            # Extract response text
            response_text = response.get('content', '') if isinstance(response, dict) else str(response)

            print(f"✅ Judge LLM completed comparison evaluation in {eval_time:.2f}s")
            print(f"📄 Response length: {len(response_text)} characters")

            # Parse comparison response
            results_by_system = self.parse_comparison_evaluation_response(response_text, systems_outputs)

            # Combine all results for storage
            all_results = []
            for system_name, system_results in results_by_system.items():
                all_results.extend(system_results)
                print(f"📊 {system_name.upper()}: {len(system_results)} evaluations parsed")

            self.evaluation_results.extend(all_results)

            return results_by_system

        except Exception as e:
            print(f"❌ Multi-system evaluation failed: {e}")

            # Create error results for all systems
            error_results = {}
            for system_name, outputs in systems_outputs.items():
                error_results[system_name] = []
                for output in outputs:
                    error_result = {
                        "query": output.get('query', ''),
                        "category": output.get('category', 'unknown'),
                        "system_type": system_name,
                        "actionability_score": 0.0,
                        "evidence_score": 0.0,
                        "evaluation_success": False,
                        "error": str(e),
                        "timestamp": datetime.now().isoformat()
                    }
                    error_results[system_name].append(error_result)
                self.evaluation_results.extend(error_results[system_name])

            return error_results

    def calculate_judge_statistics(self) -> Dict[str, Any]:
        """Calculate statistics for LLM judge evaluation"""
        successful_results = [r for r in self.evaluation_results if r.get('evaluation_success')]

        if not successful_results:
            return {
                "category_results": {},
                "overall_results": {
                    "average_actionability": 0.0,
                    "average_evidence": 0.0,
                    "successful_evaluations": 0,
                    "total_queries": len(self.evaluation_results)
                },
                "timestamp": datetime.now().isoformat()
            }

        # Group by category
        results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}

        for result in successful_results:
            category = result.get('category', 'unknown')
            if category in results_by_category:
                results_by_category[category].append(result)

        # Calculate category statistics
        category_stats = {}
        for category, results in results_by_category.items():
            if results:
                actionability_scores = [r['actionability_score'] for r in results]
                evidence_scores = [r['evidence_score'] for r in results]

                category_stats[category] = {
                    "average_actionability": sum(actionability_scores) / len(actionability_scores),
                    "average_evidence": sum(evidence_scores) / len(evidence_scores),
                    "query_count": len(results),
                    "actionability_target_met": (sum(actionability_scores) / len(actionability_scores)) >= 0.7,
                    "evidence_target_met": (sum(evidence_scores) / len(evidence_scores)) >= 0.75,
                    "individual_actionability_scores": actionability_scores,
                    "individual_evidence_scores": evidence_scores
                }
            else:
                category_stats[category] = {
                    "average_actionability": 0.0,
                    "average_evidence": 0.0,
                    "query_count": 0,
                    "actionability_target_met": False,
                    "evidence_target_met": False,
                    "individual_actionability_scores": [],
                    "individual_evidence_scores": []
                }

        # Calculate overall statistics
        all_actionability = [r['actionability_score'] for r in successful_results]
        all_evidence = [r['evidence_score'] for r in successful_results]

        overall_stats = {
            "average_actionability": sum(all_actionability) / len(all_actionability),
            "average_evidence": sum(all_evidence) / len(all_evidence),
            "successful_evaluations": len(successful_results),
            "total_queries": len(self.evaluation_results),
            "actionability_target_met": (sum(all_actionability) / len(all_actionability)) >= 0.7,
            "evidence_target_met": (sum(all_evidence) / len(all_evidence)) >= 0.75
        }

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "timestamp": datetime.now().isoformat()
        }

    def save_comparison_statistics(self, systems: List[str], filename: str = None) -> str:
        """Save comparison evaluation statistics for multiple systems"""
        stats = self.calculate_judge_statistics()

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            systems_str = "_vs_".join(systems)
            filename = f"judge_evaluation_comparison_{systems_str}_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        # Add comparison metadata
        stats["comparison_metadata"] = {
            "systems_compared": systems,
            "comparison_type": "multi_system",
            "timestamp": datetime.now().isoformat()
        }

        # Add detailed system-specific results for chart generation
        stats["detailed_system_results"] = {}
        for system in systems:
            system_results = [r for r in self.evaluation_results if r.get('system_type') == system and r.get('evaluation_success')]
            stats["detailed_system_results"][system] = {
                "results": system_results,
                "query_count": len(system_results),
                "avg_actionability": sum(r['actionability_score'] for r in system_results) / len(system_results) if system_results else 0.0,
                "avg_evidence": sum(r['evidence_score'] for r in system_results) / len(system_results) if system_results else 0.0
            }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

        print(f"📊 Comparison evaluation statistics saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent LLM judge evaluation interface with multi-system support"""

    print("🧠 OnCall.ai LLM Judge Evaluator - Metrics 5-6 Multi-System Evaluation")

    # Print evaluation rubrics for reference
    print_evaluation_rubrics()

    if len(sys.argv) < 2:
        print("Usage: python metric5_6_llm_judge_evaluator.py [system1] or [system1,system2,...]")
        print("  rag - Evaluate RAG system medical outputs")
        print("  direct - Evaluate direct LLM medical outputs")
        print("  rag,direct - Compare RAG vs Direct systems")
        print("  system1,system2,system3 - Compare multiple systems")
        sys.exit(1)

    # Parse systems from command line
    systems_input = sys.argv[1]
    systems = [s.strip() for s in systems_input.split(',')]

    # Initialize evaluator
    evaluator = LLMJudgeEvaluator()

    try:
        if len(systems) == 1:
            # Single system evaluation (legacy mode)
            system = systems[0]
            print(f"\n🧪 Single System LLM Judge Evaluation: {system.upper()}")

            # Find and load medical outputs for single system
            system_files = evaluator.find_medical_outputs_for_systems([system])
            medical_outputs = evaluator.load_medical_outputs(system_files[system])

            if not medical_outputs:
                print(f"❌ No medical outputs found for {system}")
                sys.exit(1)

            print(f"📊 Evaluating {len(medical_outputs)} medical advice outputs")
            print(f"🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)")

            # Convert to multi-system format for consistency
            systems_outputs = {system: medical_outputs}
            results_by_system = evaluator.evaluate_multiple_systems(systems_outputs)

            # Save results
            stats_path = evaluator.save_comparison_statistics([system])

        else:
            # Multi-system comparison evaluation
            print(f"\n🧪 Multi-System Comparison: {' vs '.join([s.upper() for s in systems])}")

            # Find and load medical outputs for all systems
            system_files = evaluator.find_medical_outputs_for_systems(systems)
            systems_outputs = {}

            for system in systems:
                outputs = evaluator.load_medical_outputs(system_files[system])
                if not outputs:
                    print(f"❌ No medical outputs found for {system}")
                    sys.exit(1)
                systems_outputs[system] = outputs

            # Validate all systems have same number of queries
            query_counts = [len(outputs) for outputs in systems_outputs.values()]
            if len(set(query_counts)) > 1:
                print(f"⚠️ Warning: Systems have different query counts: {dict(zip(systems, query_counts))}")

            # Validate systems processed same queries (for scientific comparison)
            print(f"🔍 Validating query consistency across systems...")
            if len(systems) > 1:
                first_system_queries = [q['query'] for q in systems_outputs[systems[0]]]
                for i, system in enumerate(systems[1:], 1):
                    system_queries = [q['query'] for q in systems_outputs[system]]

                    if first_system_queries != system_queries:
                        print(f"⚠️ Warning: {systems[0]} and {system} processed different queries!")
                        # Show first difference
                        for j, (q1, q2) in enumerate(zip(first_system_queries, system_queries)):
                            if q1 != q2:
                                print(f" Query {j+1} differs:")
                                print(f" {systems[0]}: {q1[:50]}...")
                                print(f" {system}: {q2[:50]}...")
                                break
                    else:
                        print(f"✅ {systems[0]} and {system} processed identical queries")

            # Validate systems have different model types
            model_types = set()
            for system, outputs in systems_outputs.items():
                if outputs:
                    model_type = outputs[0].get('model_type', 'unknown')
                    model_types.add(model_type)
                    print(f"🏷️ {system.upper()} system model_type: {model_type}")

            if len(model_types) == 1:
                print(f"⚠️ Warning: All systems have same model_type - this may not be a valid comparison!")
            else:
                print(f"✅ Systems have different model_types: {model_types}")

            print(f"📊 Comparing {len(systems)} systems with {min(query_counts)} queries each")
            print(f"🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)")
            print(f"⚡ Strategy: Single comparison call for maximum consistency")

            # Multi-system comparison evaluation
            results_by_system = evaluator.evaluate_multiple_systems(systems_outputs)

            # Save comparison results
            stats_path = evaluator.save_comparison_statistics(systems)

        # Print summary
        print(f"\n📊 Generating evaluation analysis...")
        stats = evaluator.calculate_judge_statistics()
        overall_results = stats['overall_results']

        print(f"\n📊 === LLM JUDGE EVALUATION SUMMARY ===")

        if len(systems) == 1:
            print(f"System: {systems[0].upper()}")
        else:
            print(f"Systems Compared: {' vs '.join([s.upper() for s in systems])}")

        print(f"Overall Performance:")
        actionability_raw = overall_results['average_actionability'] * 10
        evidence_raw = overall_results['average_evidence'] * 10

        print(f" Average Actionability: {overall_results['average_actionability']:.3f} ({actionability_raw:.1f}/10)")
        print(f" • {get_rubric_description(int(actionability_raw), 'actionability')}")
        print(f" Average Evidence Quality: {overall_results['average_evidence']:.3f} ({evidence_raw:.1f}/10)")
        print(f" • {get_rubric_description(int(evidence_raw), 'evidence')}")
        print(f" Actionability Target (≥7.0): {'✅ Met' if overall_results['actionability_target_met'] else '❌ Not Met'}")
        print(f" Evidence Target (≥7.5): {'✅ Met' if overall_results['evidence_target_met'] else '❌ Not Met'}")

        # System-specific breakdown for multi-system comparison
        if len(systems) > 1:
            print(f"\nSystem Breakdown:")
            for system in systems:
                system_results = [r for r in evaluator.evaluation_results if r.get('system_type') == system and r.get('evaluation_success')]
                if system_results:
                    avg_action = sum(r['actionability_score'] for r in system_results) / len(system_results)
                    avg_evidence = sum(r['evidence_score'] for r in system_results) / len(system_results)
                    print(f" {system.upper()}: Actionability={avg_action:.3f}, Evidence={avg_evidence:.3f} [{len(system_results)} queries]")

        print(f"\n✅ LLM judge evaluation complete!")
        print(f"📊 Statistics: {stats_path}")
        print(f"⚡ Efficiency: {overall_results['total_queries']} evaluations in 1 LLM call")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print(f"💡 Please run evaluators first:")
        for system in systems:
            if system == "rag":
                print(" python latency_evaluator.py single_test_query.txt")
            elif system == "direct":
                print(" python direct_llm_evaluator.py single_test_query.txt")
            else:
                print(f" python {system}_evaluator.py single_test_query.txt")
    except Exception as e:
        print(f"❌ Judge evaluation failed: {e}")
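A minimal usage sketch for the judge script above (an illustration, assuming it is run from the evaluation/ directory so that its results/ folder resolves as in the code):

    python metric5_6_llm_judge_evaluator.py rag,direct

The printed averages are on a 0-1 scale (raw 1-10 judge scores divided by 10.0), so the ≥7.0 actionability and ≥7.5 evidence targets correspond to the 0.7 and 0.75 checks in calculate_judge_statistics.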
evaluation/metric7_8_precision_MRR.py
ADDED
@@ -0,0 +1,402 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Precision & MRR Analyzer (Metrics 7-8)
========================================================

Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
using data collected from latency_evaluator.py comprehensive evaluation.

IMPORTANT CHANGES - Angular Distance & Relevance Calculation:
- DISTANCE METRIC: Uses Angular Distance from Annoy index (range: 0.0-1.0, smaller = more relevant)
- RELEVANCE CONVERSION: relevance = 1.0 - (angular_distance²) / 2.0 (mathematically correct formula)
- THRESHOLD ALIGNMENT: Aligned with Metric 3 relevance calculation standards
- DISPLAY UPDATE: Changed from "Relevance: X" to "Angular Distance: X" for clarity

METRICS CALCULATED:
7. Precision@K (retrieval precision) - Proportion of relevant results in top-K retrieval
8. Mean Reciprocal Rank (MRR) - Average reciprocal rank of the first relevant result

DESIGN PRINCIPLE:
- Reuses comprehensive_details_*.json from latency_evaluator.py
- Implements adaptive threshold based on query complexity
- Query complexity determined by actual matched emergency keywords count
- No additional LLM calls required

Author: YanBo Chen
Date: 2025-08-04
Updated: 2025-08-04 (Angular Distance alignment)
"""

import json
import os
import sys
from typing import Dict, List, Any, Set
from datetime import datetime
from pathlib import Path
import re
import statistics

# Relevance threshold constants for adaptive query complexity handling
COMPLEX_QUERY_RELEVANCE_THRESHOLD = 0.65  # For queries with multiple emergency keywords
SIMPLE_QUERY_RELEVANCE_THRESHOLD = 0.75  # For straightforward diagnostic queries

class PrecisionMRRAnalyzer:
    """Specialized analyzer for metrics 7-8 using existing comprehensive evaluation data"""

    def __init__(self):
        """Initialize analyzer"""
        print("🔧 Initializing Precision & MRR Analyzer...")
        self.analysis_results = []
        print("✅ Analyzer initialization complete")

    def load_comprehensive_data(self, filepath: str) -> List[Dict]:
        """
        Load comprehensive evaluation data from latency_evaluator.py output

        Args:
            filepath: Path to comprehensive_details_*.json file

        Returns:
            List of comprehensive evaluation results
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            comprehensive_results = data.get('comprehensive_results', [])

            print(f"📁 Loaded {len(comprehensive_results)} comprehensive evaluation results")
            print(f"📊 Ready for precision/MRR analysis: {sum(1 for r in comprehensive_results if r.get('precision_mrr_ready'))}")

            return comprehensive_results

        except Exception as e:
            print(f"❌ Failed to load comprehensive data: {e}")
            return []

    def _is_complex_query(self, query: str, processed_results: List[Dict]) -> bool:
        """
        Determine query complexity based on actual matched emergency keywords

        Args:
            query: Original query text
            processed_results: Retrieval results with matched keywords

        Returns:
            True if query is complex (should use lenient threshold)
        """
        # Collect unique emergency keywords actually found in retrieval results
        unique_emergency_keywords = set()

        for result in processed_results:
            if result.get('type') == 'emergency':
                matched_keywords = result.get('matched', '')
                if matched_keywords:
                    keywords = [kw.strip() for kw in matched_keywords.split('|') if kw.strip()]
                    unique_emergency_keywords.update(keywords)

        keyword_count = len(unique_emergency_keywords)

        # Business logic: 4+ different emergency keywords indicate a complex case
        is_complex = keyword_count >= 4

        print(f" 🧠 Query complexity: {'Complex' if is_complex else 'Simple'} ({keyword_count} emergency keywords)")
        print(f" 🔑 Found keywords: {', '.join(list(unique_emergency_keywords)[:5])}")

        return is_complex

    def calculate_precision_mrr_single(self, query_data: Dict) -> Dict[str, Any]:
        """
        Calculate Precision@K and MRR for a single query

        Args:
            query_data: Single query's comprehensive evaluation result

        Returns:
            Precision and MRR metrics for this query
        """
        query = query_data['query']
        category = query_data['category']

        # Extract processed results from pipeline data
        pipeline_data = query_data.get('pipeline_data', {})
        retrieval_results = pipeline_data.get('retrieval_results', {})
        processed_results = retrieval_results.get('processed_results', [])

        print(f"🔍 Analyzing precision/MRR for: {query[:50]}...")
        print(f"📋 Category: {category}, Results: {len(processed_results)}")

        if not processed_results:
            return self._create_empty_precision_mrr_result(query, category)

        # Step 1: Determine query complexity
        is_complex = self._is_complex_query(query, processed_results)

        # Step 2: Choose adaptive threshold (aligned with Metric 3 relevance standards)
        threshold = COMPLEX_QUERY_RELEVANCE_THRESHOLD if is_complex else SIMPLE_QUERY_RELEVANCE_THRESHOLD  # Updated thresholds for complex/simple queries

        print(f" 🎯 Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")

        # Step 3: Calculate relevance scores using correct angular distance formula
        relevance_scores = []
        for result in processed_results:
            distance = result.get('distance', 1.0)
            relevance = 1.0 - (distance**2) / 2.0  # Correct mathematical conversion
            relevance_scores.append(relevance)

        # Step 4: Calculate Precision@K (aligned with Metric 3 thresholds)
        relevant_count = sum(1 for score in relevance_scores if score >= threshold)
        precision_at_k = relevant_count / len(processed_results)

        # Step 5: Calculate MRR
        first_relevant_rank = None
        for i, score in enumerate(relevance_scores, 1):
            if score >= threshold:
                first_relevant_rank = i
                break

        mrr_score = (1.0 / first_relevant_rank) if first_relevant_rank else 0.0

        # Detailed analysis
        result = {
            "query": query,
            "category": category,
            "query_complexity": "complex" if is_complex else "simple",
            "threshold_used": threshold,

            # Metric 7: Precision@K
            "precision_at_k": precision_at_k,
            "relevant_count": relevant_count,
            "total_results": len(processed_results),

            # Metric 8: MRR
            "mrr_score": mrr_score,
            "first_relevant_rank": first_relevant_rank,

            # Supporting data
            "relevance_scores": relevance_scores,
            "avg_relevance": sum(relevance_scores) / len(relevance_scores),
            "max_relevance": max(relevance_scores),
            "min_relevance": min(relevance_scores),

            "timestamp": datetime.now().isoformat()
        }

        print(f" 📊 Precision@{len(processed_results)}: {precision_at_k:.3f} ({relevant_count}/{len(processed_results)} relevant)")
        print(f" 📊 MRR: {mrr_score:.3f} (first relevant at rank {first_relevant_rank})")

        return result
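    # Worked example with made-up distances (editor's illustration of the formulas above):
    # distances [0.6, 1.0, 0.9] convert to relevance [0.82, 0.50, 0.595] via 1 - d**2 / 2.
    # Against the simple-query threshold of 0.75, only the first result counts as relevant,
    # so Precision@3 = 1/3 ≈ 0.333, the first relevant rank is 1, and MRR = 1.0.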
    def _create_empty_precision_mrr_result(self, query: str, category: str) -> Dict[str, Any]:
        """Create empty result for failed queries"""
        return {
            "query": query,
            "category": category,
            "query_complexity": "unknown",
            "threshold_used": 0.0,
            "precision_at_k": 0.0,
            "relevant_count": 0,
            "total_results": 0,
            "mrr_score": 0.0,
            "first_relevant_rank": None,
            "relevance_scores": [],
            "timestamp": datetime.now().isoformat()
        }

    def analyze_all_queries(self, comprehensive_results: List[Dict]) -> List[Dict]:
        """
        Analyze precision/MRR for all queries in comprehensive evaluation

        Args:
            comprehensive_results: Results from latency_evaluator.py

        Returns:
            List of precision/MRR analysis results
        """
        print(f"\n📊 Analyzing Precision@K and MRR for {len(comprehensive_results)} queries...")

        analysis_results = []

        for i, query_data in enumerate(comprehensive_results):
            if not query_data.get('precision_mrr_ready'):
                print(f"⏭️ Skipping query {i+1}: Not ready for precision/MRR analysis")
                continue

            if not query_data.get('overall_success'):
                print(f"⏭️ Skipping query {i+1}: Pipeline failed")
                analysis_results.append(self._create_empty_precision_mrr_result(
                    query_data['query'],
                    query_data['category']
                ))
                continue

            # Analyze this query
            result = self.calculate_precision_mrr_single(query_data)
            analysis_results.append(result)

            print("")  # Spacing between queries

        self.analysis_results = analysis_results
        return analysis_results

    def calculate_statistics(self) -> Dict[str, Any]:
        """Calculate comprehensive statistics for metrics 7-8"""

        if not self.analysis_results:
            return {"error": "No analysis results available"}

        # Separate by complexity and category
        stats = {
            "overall_statistics": {},
            "by_complexity": {"simple": {}, "complex": {}},
            "by_category": {"diagnosis": {}, "treatment": {}, "mixed": {}},
            "timestamp": datetime.now().isoformat()
        }

        # Overall statistics
        all_precision = [r['precision_at_k'] for r in self.analysis_results]
        all_mrr = [r['mrr_score'] for r in self.analysis_results]

        stats["overall_statistics"] = {
            "total_queries": len(self.analysis_results),
            "avg_precision": statistics.mean(all_precision),
            "avg_mrr": statistics.mean(all_mrr),
            "precision_std": statistics.stdev(all_precision) if len(all_precision) > 1 else 0.0,
            "mrr_std": statistics.stdev(all_mrr) if len(all_mrr) > 1 else 0.0
        }

        # By complexity
        for complexity in ["simple", "complex"]:
            complexity_results = [r for r in self.analysis_results if r['query_complexity'] == complexity]
            if complexity_results:
                precision_scores = [r['precision_at_k'] for r in complexity_results]
                mrr_scores = [r['mrr_score'] for r in complexity_results]

                stats["by_complexity"][complexity] = {
                    "query_count": len(complexity_results),
                    "avg_precision": statistics.mean(precision_scores),
                    "avg_mrr": statistics.mean(mrr_scores),
                    "avg_threshold": statistics.mean([r['threshold_used'] for r in complexity_results])
                }

        # By category
        for category in ["diagnosis", "treatment", "mixed"]:
            category_results = [r for r in self.analysis_results if r['category'] == category]
            if category_results:
                precision_scores = [r['precision_at_k'] for r in category_results]
                mrr_scores = [r['mrr_score'] for r in category_results]

                stats["by_category"][category] = {
                    "query_count": len(category_results),
                    "avg_precision": statistics.mean(precision_scores),
                    "avg_mrr": statistics.mean(mrr_scores)
                }

        return stats

    def save_results(self, filename: str = None) -> str:
        """Save precision/MRR analysis results"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"precision_mrr_analysis_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)

        filepath = results_dir / filename

        # Create output data
        output_data = {
            "analysis_metadata": {
                "total_queries": len(self.analysis_results),
                "analysis_type": "precision_mrr_metrics_7_8",
                "timestamp": datetime.now().isoformat(),
                "adaptive_threshold": True
            },
            "detailed_results": self.analysis_results,
            "statistics": self.calculate_statistics()
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"📊 Precision/MRR analysis saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent precision/MRR analysis interface"""

    print("📊 OnCall.ai Precision & MRR Analyzer - Metrics 7-8")

    if len(sys.argv) > 1:
        comprehensive_file = sys.argv[1]
    else:
        # Look for latest comprehensive_details file
        results_dir = Path(__file__).parent / "results"
        if results_dir.exists():
            comprehensive_files = list(results_dir.glob("comprehensive_details_*.json"))
            if comprehensive_files:
                comprehensive_file = str(sorted(comprehensive_files)[-1])  # Latest file
                print(f"📁 Using latest comprehensive file: {comprehensive_file}")
            else:
                print("❌ No comprehensive_details_*.json files found")
                print("Please run latency_evaluator.py first to generate comprehensive data")
                sys.exit(1)
        else:
            print("❌ Results directory not found")
            sys.exit(1)

    if not os.path.exists(comprehensive_file):
        print(f"❌ Comprehensive file not found: {comprehensive_file}")
        print("Usage: python precision_MRR.py [comprehensive_details_file.json]")
        sys.exit(1)

    # Initialize analyzer
    analyzer = PrecisionMRRAnalyzer()

    # Load comprehensive data from latency_evaluator.py
    comprehensive_results = analyzer.load_comprehensive_data(comprehensive_file)

    if not comprehensive_results:
        print("❌ No comprehensive data loaded")
        sys.exit(1)

    # Analyze precision/MRR for all queries
    analysis_results = analyzer.analyze_all_queries(comprehensive_results)

    # Calculate and display statistics
    statistics_result = analyzer.calculate_statistics()

    print(f"\n📊 === PRECISION & MRR ANALYSIS SUMMARY ===")

    overall_stats = statistics_result['overall_statistics']
    print(f"\nOVERALL METRICS:")
    print(f" Precision@K: {overall_stats['avg_precision']:.3f} (±{overall_stats['precision_std']:.3f})")
    print(f" MRR: {overall_stats['avg_mrr']:.3f} (±{overall_stats['mrr_std']:.3f})")
    print(f" Total Queries: {overall_stats['total_queries']}")

    # Complexity-based statistics
    complexity_stats = statistics_result['by_complexity']
    print(f"\nBY COMPLEXITY:")
    for complexity, stats in complexity_stats.items():
        if stats:
            print(f" {complexity.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
                  f"(threshold={stats['avg_threshold']:.2f}, n={stats['query_count']})")

    # Category-based statistics
    category_stats = statistics_result['by_category']
    print(f"\nBY CATEGORY:")
    for category, stats in category_stats.items():
        if stats:
            print(f" {category.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
                  f"(n={stats['query_count']})")

    # Save results
    saved_path = analyzer.save_results()

    print(f"\n✅ Precision & MRR analysis complete!")
    print(f"📁 Results saved to: {saved_path}")
    print(f"\n💡 Next step: Create precision_mrr_chart_generator.py for visualization")
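A minimal run order for metrics 7-8, pieced together from the usage messages in these scripts (assuming commands are run from the evaluation/ directory):

    python latency_evaluator.py single_test_query.txt    # writes results/comprehensive_details_*.json
    python metric7_8_precision_MRR.py                     # picks the latest comprehensive file, writes results/precision_mrr_analysis_*.json

The chart generator added below then loads the latest precision_mrr_analysis_*.json from the same results/ directory.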
evaluation/metric7_8_precision_mrr_chart_generator.py
ADDED
@@ -0,0 +1,586 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Precision & MRR Chart Generator (Metrics 7-8)
===============================================================

Generates comprehensive Precision@K and MRR analysis charts from saved analysis results.
Reads JSON files produced by metric7_8_precision_MRR.py and creates visualizations.

Charts generated:
1. Precision@K comparison by category and complexity
2. MRR comparison by category and complexity
3. Combined metrics heatmap
4. Threshold impact analysis
5. Detailed statistics tables

No LLM calls - pure data visualization.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class PrecisionMRRChartGenerator:
    """Generate charts from precision/MRR analysis results - no LLM dependency"""

    def __init__(self):
        """Initialize chart generator"""
        print("📈 Initializing Precision & MRR Chart Generator...")

        # Set up professional chart style
        plt.style.use('default')
        sns.set_palette("husl")

        print("✅ Chart Generator ready")

    def load_latest_analysis(self, results_dir: str = None) -> Dict[str, Any]:
        """
        Load the most recent precision/MRR analysis file

        Args:
            results_dir: Directory containing analysis files
        """
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        analysis_files = glob.glob(str(results_dir / "precision_mrr_analysis_*.json"))

        if not analysis_files:
            raise FileNotFoundError("No precision_mrr_analysis_*.json files found. Run metric7_8_precision_MRR.py first.")

        latest_file = max(analysis_files, key=os.path.getctime)
        print(f"📁 Loading latest analysis: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            return json.load(f)
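    # Minimal driver sketch (editor's example; assumes matplotlib can write to the charts/
    # directory that the create_* methods default to):
    #   generator = PrecisionMRRChartGenerator()
    #   data = generator.load_latest_analysis()
    #   generator.create_precision_comparison_chart(data)
    #   generator.create_mrr_comparison_chart(data)
    # Each create_* method returns the path of the PNG it saved under charts/.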
    def create_precision_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create Precision@K comparison chart"""

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Chart 1: Precision by Category
        category_stats = analysis_data['statistics']['by_category']
        categories = []
        precisions = []

        for category, stats in category_stats.items():
            if stats:
                categories.append(category.title())
                precisions.append(stats['avg_precision'])

        if categories:
            bars1 = ax1.bar(categories, precisions, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728'])
            ax1.set_title('Precision@K by Query Category', fontweight='bold')
            ax1.set_ylabel('Precision@K')
            ax1.set_xlabel('Query Category')
            ax1.set_ylim(0, 1.0)
            ax1.grid(True, alpha=0.3)

            # Add value labels
            for bar, precision in zip(bars1, precisions):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{precision:.3f}', ha='center', va='bottom', fontweight='bold')

        # Chart 2: Precision by Complexity
        complexity_stats = analysis_data['statistics']['by_complexity']
        complexities = []
        comp_precisions = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                comp_precisions.append(stats['avg_precision'])

        if complexities:
            bars2 = ax2.bar(complexities, comp_precisions, alpha=0.8, color=['#2ca02c', '#d62728'])
            ax2.set_title('Precision@K by Query Complexity', fontweight='bold')
            ax2.set_ylabel('Precision@K')
            ax2.set_xlabel('Query Complexity')
            ax2.set_ylim(0, 1.0)
            ax2.grid(True, alpha=0.3)

            # Add value labels and threshold info
            for bar, precision, complexity in zip(bars2, comp_precisions, complexities):
                height = bar.get_height()
                threshold = 0.15 if complexity.lower() == 'complex' else 0.25
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{precision:.3f}\n(T={threshold})', ha='center', va='bottom',
                         fontweight='bold', fontsize=9)

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Precision comparison chart saved: {save_path}")
        return str(save_path)

    def create_mrr_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create MRR comparison chart"""

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Chart 1: MRR by Category
        category_stats = analysis_data['statistics']['by_category']
        categories = []
        mrr_scores = []

        for category, stats in category_stats.items():
            if stats:
                categories.append(category.title())
                mrr_scores.append(stats['avg_mrr'])

        if categories:
            bars1 = ax1.bar(categories, mrr_scores, alpha=0.8, color=['#9467bd', '#8c564b', '#e377c2'])
            ax1.set_title('Mean Reciprocal Rank by Query Category', fontweight='bold')
            ax1.set_ylabel('MRR Score')
            ax1.set_xlabel('Query Category')
            ax1.set_ylim(0, 1.0)
            ax1.grid(True, alpha=0.3)

            # Add value labels
            for bar, mrr in zip(bars1, mrr_scores):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{mrr:.3f}', ha='center', va='bottom', fontweight='bold')

        # Chart 2: MRR by Complexity
        complexity_stats = analysis_data['statistics']['by_complexity']
        complexities = []
        comp_mrr = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                comp_mrr.append(stats['avg_mrr'])

        if complexities:
            bars2 = ax2.bar(complexities, comp_mrr, alpha=0.8, color=['#17becf', '#bcbd22'])
            ax2.set_title('MRR by Query Complexity', fontweight='bold')
            ax2.set_ylabel('MRR Score')
            ax2.set_xlabel('Query Complexity')
            ax2.set_ylim(0, 1.0)
            ax2.grid(True, alpha=0.3)

            # Add value labels
            for bar, mrr in zip(bars2, comp_mrr):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{mrr:.3f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"mrr_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 MRR comparison chart saved: {save_path}")
        return str(save_path)

    def create_combined_metrics_heatmap(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create combined precision/MRR heatmap"""

        # Prepare data for heatmap
        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("⚠️ No detailed results for heatmap")
            return ""

        # Create DataFrame for heatmap
        heatmap_data = []
        for result in detailed_results:
            heatmap_data.append({
                'Category': result['category'].title(),
                'Complexity': result['query_complexity'].title(),
                'Precision@K': result['precision_at_k'],
                'MRR': result['mrr_score'],
                'Threshold': result['threshold_used']
            })

        df = pd.DataFrame(heatmap_data)

        # Create pivot table for heatmap
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # Precision heatmap
        precision_pivot = df.pivot_table(values='Precision@K', index='Category', columns='Complexity', aggfunc='mean')
        sns.heatmap(precision_pivot, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax1,
                    cbar_kws={'label': 'Precision@K'}, vmin=0, vmax=1)
        ax1.set_title('Precision@K Heatmap\n(Category vs Complexity)', fontweight='bold')

        # MRR heatmap
        mrr_pivot = df.pivot_table(values='MRR', index='Category', columns='Complexity', aggfunc='mean')
        sns.heatmap(mrr_pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=ax2,
                    cbar_kws={'label': 'MRR Score'}, vmin=0, vmax=1)
        ax2.set_title('MRR Heatmap\n(Category vs Complexity)', fontweight='bold')

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_mrr_heatmap_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Combined metrics heatmap saved: {save_path}")
        return str(save_path)

    def create_threshold_impact_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create threshold impact analysis chart"""

        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("⚠️ No detailed results for threshold analysis")
            return ""

        # Group by complexity and calculate average relevance
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Prepare data
        simple_queries = [r for r in detailed_results if r['query_complexity'] == 'simple']
        complex_queries = [r for r in detailed_results if r['query_complexity'] == 'complex']

        # Chart 1: Relevance distribution for different complexities
        if simple_queries:
            simple_relevances = []
            for query in simple_queries:
                simple_relevances.extend(query.get('relevance_scores', []))

            ax1.hist(simple_relevances, bins=10, alpha=0.7, label=f'Simple (T=0.25)', color='#2ca02c', density=True)
            ax1.axvline(x=0.25, color='#2ca02c', linestyle='--', linewidth=2, label='Simple Threshold')

        if complex_queries:
            complex_relevances = []
            for query in complex_queries:
                complex_relevances.extend(query.get('relevance_scores', []))

            ax1.hist(complex_relevances, bins=10, alpha=0.7, label=f'Complex (T=0.15)', color='#d62728', density=True)
            ax1.axvline(x=0.15, color='#d62728', linestyle='--', linewidth=2, label='Complex Threshold')

        ax1.set_title('Relevance Score Distribution\nby Query Complexity', fontweight='bold')
        ax1.set_xlabel('Relevance Score')
        ax1.set_ylabel('Density')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Chart 2: Metrics comparison
        complexity_stats = analysis_data['statistics']['by_complexity']

        complexities = []
        precisions = []
        mrrs = []
        thresholds = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                precisions.append(stats['avg_precision'])
                mrrs.append(stats['avg_mrr'])
                thresholds.append(stats['avg_threshold'])

        x = np.arange(len(complexities))
        width = 0.35

        bars1 = ax2.bar(x - width/2, precisions, width, label='Precision@K', alpha=0.8, color='#ff7f0e')
        bars2 = ax2.bar(x + width/2, mrrs, width, label='MRR', alpha=0.8, color='#1f77b4')

        ax2.set_title('Metrics Comparison by Complexity\n(with Adaptive Thresholds)', fontweight='bold')
        ax2.set_ylabel('Score')
        ax2.set_xlabel('Query Complexity')
        ax2.set_xticks(x)
        ax2.set_xticklabels(complexities)
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        ax2.set_ylim(0, 1.0)

        # Add value labels
        for bars, values, thresholds_vals in [(bars1, precisions, thresholds), (bars2, mrrs, thresholds)]:
            for bar, value, threshold in zip(bars, values, thresholds_vals):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{value:.3f}', ha='center', va='bottom', fontweight='bold', fontsize=9)

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"threshold_impact_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Threshold impact chart saved: {save_path}")
        return str(save_path)

    def create_detailed_analysis_table(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create detailed statistics table"""

        fig, ax = plt.subplots(figsize=(12, 8))
        ax.axis('tight')
        ax.axis('off')

        # Prepare table data
        table_data = []

        # Overall statistics
        overall_stats = analysis_data['statistics']['overall_statistics']
        table_data.append(['OVERALL METRICS', '', '', '', ''])
        table_data.append(['Total Queries', str(overall_stats['total_queries']), '', '', ''])
        table_data.append(['Avg Precision@K', f"{overall_stats['avg_precision']:.3f}",
                           f"±{overall_stats['precision_std']:.3f}", '', ''])
        table_data.append(['Avg MRR', f"{overall_stats['avg_mrr']:.3f}",
                           f"±{overall_stats['mrr_std']:.3f}", '', ''])
        table_data.append(['', '', '', '', ''])

        # By category
        table_data.append(['BY CATEGORY', 'Queries', 'Precision@K', 'MRR', 'Notes'])
        category_stats = analysis_data['statistics']['by_category']
        for category, stats in category_stats.items():
            if stats:
                table_data.append([
                    category.title(),
                    str(stats['query_count']),
                    f"{stats['avg_precision']:.3f}",
                    f"{stats['avg_mrr']:.3f}",
                    ''
                ])

        table_data.append(['', '', '', '', ''])

        # By complexity
        table_data.append(['BY COMPLEXITY', 'Queries', 'Precision@K', 'MRR', 'Threshold'])
        complexity_stats = analysis_data['statistics']['by_complexity']
        for complexity, stats in complexity_stats.items():
            if stats:
                table_data.append([
                    complexity.title(),
                    str(stats['query_count']),
                    f"{stats['avg_precision']:.3f}",
                    f"{stats['avg_mrr']:.3f}",
                    f"{stats['avg_threshold']:.2f}"
                ])

        # Create table
        table = ax.table(cellText=table_data,
                         colLabels=['Metric', 'Value 1', 'Value 2', 'Value 3', 'Value 4'],
                         cellLoc='center',
                         loc='center',
                         bbox=[0, 0, 1, 1])

        # Style the table
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1, 2)

        # Header styling
        for i in range(5):
            table[(0, i)].set_facecolor('#40466e')
            table[(0, i)].set_text_props(weight='bold', color='white')

        # Section headers styling
        for i, row in enumerate(table_data):
            if row[0] in ['OVERALL METRICS', 'BY CATEGORY', 'BY COMPLEXITY']:
                table[(i+1, 0)].set_facecolor('#1f77b4')
                table[(i+1, 0)].set_text_props(weight='bold', color='white')

        plt.title('Precision@K & MRR Detailed Analysis\nMetrics 7-8 Statistics',
                  fontweight='bold', fontsize=14, pad=20)

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_mrr_table_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Detailed analysis table saved: {save_path}")
        return str(save_path)

    def create_individual_query_analysis(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create individual query analysis chart"""

        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("⚠️ No detailed results for individual analysis")
| 442 |
+
return ""
|
| 443 |
+
|
| 444 |
+
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))
|
| 445 |
+
|
| 446 |
+
# Prepare data
|
| 447 |
+
query_indices = []
|
| 448 |
+
precisions = []
|
| 449 |
+
mrrs = []
|
| 450 |
+
colors = []
|
| 451 |
+
labels = []
|
| 452 |
+
|
| 453 |
+
for i, result in enumerate(detailed_results):
|
| 454 |
+
query_indices.append(i + 1)
|
| 455 |
+
precisions.append(result['precision_at_k'])
|
| 456 |
+
mrrs.append(result['mrr_score'])
|
| 457 |
+
|
| 458 |
+
# Color by complexity
|
| 459 |
+
if result['query_complexity'] == 'complex':
|
| 460 |
+
colors.append('#d62728') # Red for complex
|
| 461 |
+
else:
|
| 462 |
+
colors.append('#2ca02c') # Green for simple
|
| 463 |
+
|
| 464 |
+
# Create short label
|
| 465 |
+
query_short = result['query'][:30] + "..." if len(result['query']) > 30 else result['query']
|
| 466 |
+
category = result['category'][:4].upper()
|
| 467 |
+
labels.append(f"{category}\n{query_short}")
|
| 468 |
+
|
| 469 |
+
# Chart 1: Precision@K for each query
|
| 470 |
+
bars1 = ax1.bar(query_indices, precisions, color=colors, alpha=0.8)
|
| 471 |
+
ax1.set_title('Precision@K by Individual Query', fontweight='bold')
|
| 472 |
+
ax1.set_ylabel('Precision@K')
|
| 473 |
+
ax1.set_xlabel('Query Index')
|
| 474 |
+
ax1.set_ylim(0, 1.0)
|
| 475 |
+
ax1.grid(True, alpha=0.3)
|
| 476 |
+
|
| 477 |
+
# Add value labels
|
| 478 |
+
for bar, precision in zip(bars1, precisions):
|
| 479 |
+
height = bar.get_height()
|
| 480 |
+
ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
|
| 481 |
+
f'{precision:.2f}', ha='center', va='bottom', fontsize=8)
|
| 482 |
+
|
| 483 |
+
# Chart 2: MRR for each query
|
| 484 |
+
bars2 = ax2.bar(query_indices, mrrs, color=colors, alpha=0.8)
|
| 485 |
+
ax2.set_title('MRR by Individual Query', fontweight='bold')
|
| 486 |
+
ax2.set_ylabel('MRR Score')
|
| 487 |
+
ax2.set_xlabel('Query Index')
|
| 488 |
+
ax2.set_ylim(0, 1.0)
|
| 489 |
+
ax2.grid(True, alpha=0.3)
|
| 490 |
+
|
| 491 |
+
# Add value labels
|
| 492 |
+
for bar, mrr in zip(bars2, mrrs):
|
| 493 |
+
height = bar.get_height()
|
| 494 |
+
ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
|
| 495 |
+
f'{mrr:.2f}', ha='center', va='bottom', fontsize=8)
|
| 496 |
+
|
| 497 |
+
# Add legend
|
| 498 |
+
from matplotlib.patches import Patch
|
| 499 |
+
legend_elements = [
|
| 500 |
+
Patch(facecolor='#2ca02c', alpha=0.8, label='Simple Query (T=0.25)'),
|
| 501 |
+
Patch(facecolor='#d62728', alpha=0.8, label='Complex Query (T=0.15)')
|
| 502 |
+
]
|
| 503 |
+
ax1.legend(handles=legend_elements, loc='upper right')
|
| 504 |
+
|
| 505 |
+
plt.tight_layout()
|
| 506 |
+
|
| 507 |
+
# Save chart
|
| 508 |
+
if save_path is None:
|
| 509 |
+
save_path = Path(__file__).parent / "charts" / f"individual_query_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
|
| 510 |
+
|
| 511 |
+
save_path = Path(save_path)
|
| 512 |
+
save_path.parent.mkdir(parents=True, exist_ok=True)
|
| 513 |
+
plt.savefig(save_path, dpi=300, bbox_inches='tight')
|
| 514 |
+
plt.close()
|
| 515 |
+
|
| 516 |
+
print(f"📊 Individual query analysis saved: {save_path}")
|
| 517 |
+
return str(save_path)
|
| 518 |
+
|
| 519 |
+
def generate_all_charts(self, analysis_data: Dict = None) -> Dict[str, str]:
|
| 520 |
+
"""Generate all precision/MRR charts"""
|
| 521 |
+
|
| 522 |
+
if analysis_data is None:
|
| 523 |
+
analysis_data = self.load_latest_analysis()
|
| 524 |
+
|
| 525 |
+
print(f"\n📈 Generating all Precision & MRR charts...")
|
| 526 |
+
|
| 527 |
+
saved_charts = {}
|
| 528 |
+
|
| 529 |
+
# Generate all chart types
|
| 530 |
+
try:
|
| 531 |
+
saved_charts['precision_comparison'] = self.create_precision_comparison_chart(analysis_data)
|
| 532 |
+
saved_charts['mrr_comparison'] = self.create_mrr_comparison_chart(analysis_data)
|
| 533 |
+
saved_charts['combined_heatmap'] = self.create_combined_metrics_heatmap(analysis_data)
|
| 534 |
+
saved_charts['threshold_impact'] = self.create_threshold_impact_chart(analysis_data)
|
| 535 |
+
saved_charts['individual_analysis'] = self.create_individual_query_analysis(analysis_data)
|
| 536 |
+
|
| 537 |
+
except Exception as e:
|
| 538 |
+
print(f"❌ Error generating charts: {e}")
|
| 539 |
+
return {"error": str(e)}
|
| 540 |
+
|
| 541 |
+
print(f"\n✅ All precision/MRR charts generated successfully!")
|
| 542 |
+
print(f"📁 Charts saved to: evaluation/charts/")
|
| 543 |
+
|
| 544 |
+
return saved_charts
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
# Independent execution interface
|
| 548 |
+
if __name__ == "__main__":
|
| 549 |
+
"""Generate precision/MRR charts from analysis results"""
|
| 550 |
+
|
| 551 |
+
print("📈 OnCall.ai Precision & MRR Chart Generator - Metrics 7-8")
|
| 552 |
+
|
| 553 |
+
if len(sys.argv) > 1:
|
| 554 |
+
analysis_file = sys.argv[1]
|
| 555 |
+
|
| 556 |
+
if not os.path.exists(analysis_file):
|
| 557 |
+
print(f"❌ Analysis file not found: {analysis_file}")
|
| 558 |
+
sys.exit(1)
|
| 559 |
+
else:
|
| 560 |
+
analysis_file = None # Will use latest file
|
| 561 |
+
|
| 562 |
+
# Initialize generator
|
| 563 |
+
generator = PrecisionMRRChartGenerator()
|
| 564 |
+
|
| 565 |
+
try:
|
| 566 |
+
# Load analysis data
|
| 567 |
+
if analysis_file:
|
| 568 |
+
with open(analysis_file, 'r', encoding='utf-8') as f:
|
| 569 |
+
analysis_data = json.load(f)
|
| 570 |
+
print(f"📁 Using specified analysis file: {analysis_file}")
|
| 571 |
+
else:
|
| 572 |
+
analysis_data = generator.load_latest_analysis()
|
| 573 |
+
|
| 574 |
+
# Generate all charts
|
| 575 |
+
saved_charts = generator.generate_all_charts(analysis_data)
|
| 576 |
+
|
| 577 |
+
if 'error' not in saved_charts:
|
| 578 |
+
print(f"\n📊 === PRECISION & MRR CHART GENERATION SUMMARY ===")
|
| 579 |
+
for chart_type, filepath in saved_charts.items():
|
| 580 |
+
print(f" 📈 {chart_type.replace('_', ' ').title()}: {filepath}")
|
| 581 |
+
|
| 582 |
+
print(f"\n💡 Charts ready for analysis and presentation!")
|
| 583 |
+
|
| 584 |
+
except Exception as e:
|
| 585 |
+
print(f"❌ Chart generation failed: {e}")
|
| 586 |
+
sys.exit(1)
|
evaluation/old/coverage_evaluator.py
ADDED
@@ -0,0 +1,560 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Retrieval Coverage Evaluator (Metric 4)
==========================================================

Evaluates how well generated medical advice utilizes retrieved content
Automatic evaluation using keyword overlap analysis with optional LLM sampling

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any, Set
from datetime import datetime
from pathlib import Path
import re

# Add project path
current_dir = Path(__file__).parent
project_root = current_dir.parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Import existing system components
try:
    from user_prompt import UserPromptProcessor
    from retrieval import BasicRetrievalSystem
    from llm_clients import llm_Med42_70BClient
    from generation import MedicalAdviceGenerator
except ImportError as e:
    print(f"❌ Import failed: {e}")
    print("Please ensure running from project root directory")
    sys.exit(1)


class CoverageEvaluator:
    """Retrieval coverage evaluator using keyword overlap analysis"""

    def __init__(self):
        """Initialize system components for coverage testing"""
        print("🔧 Initializing Coverage Evaluator...")

        # Initialize full pipeline components (needed for advice generation)
        self.llm_client = llm_Med42_70BClient()
        self.retrieval_system = BasicRetrievalSystem()
        self.user_prompt_processor = UserPromptProcessor(
            llm_client=self.llm_client,
            retrieval_system=self.retrieval_system
        )
        self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)

        # Results accumulation
        self.coverage_results = []

        print("✅ Coverage Evaluator initialization complete")

    def extract_medical_keywords(self, text: str) -> Set[str]:
        """
        Extract medical keywords from text for coverage analysis

        Uses medical terminology patterns and common medical terms
        """
        if not text:
            return set()

        medical_keywords = set()
        text_lower = text.lower()

        # Medical terminology patterns
        patterns = [
            r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b',  # Medical suffixes
            r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b',  # Medical prefixes
            r'\b(?:diagnosis|treatment|therapy|intervention|management)\b',  # Medical actions
            r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b',  # Medical entities
            r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b',  # Medical descriptors
            r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b',  # Compound terms
            r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b',  # Medical procedures
            r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b',  # Dosages and timeframes
        ]

        for pattern in patterns:
            matches = re.findall(pattern, text_lower)
            medical_keywords.update(match.strip() for match in matches)

        # Additional common medical terms
        common_medical_terms = [
            'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
            'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
            'protocol', 'guideline', 'recommendation', 'risk', 'factor'
        ]

        for term in common_medical_terms:
            if term in text_lower:
                medical_keywords.add(term)

        # Filter out very short terms and common words
        filtered_keywords = {
            kw for kw in medical_keywords
            if len(kw) > 2 and kw not in ['the', 'and', 'for', 'with', 'are', 'can', 'may']
        }

        return filtered_keywords

    def calculate_coverage_score(self, generated_advice: str, retrieval_results: List[Dict]) -> Dict[str, Any]:
        """
        Calculate coverage score based on keyword overlap between advice and retrieved docs

        Args:
            generated_advice: Generated medical advice text
            retrieval_results: List of retrieved documents
        """
        if not generated_advice or not retrieval_results:
            return {
                "coverage_score": 0.0,
                "matched_keywords": [],
                "advice_keywords": [],
                "source_keywords": [],
                "coverage_details": []
            }

        # Extract keywords from generated advice
        advice_keywords = self.extract_medical_keywords(generated_advice)

        # Extract keywords from all retrieved documents
        all_source_keywords = set()
        coverage_details = []

        for i, doc in enumerate(retrieval_results):
            doc_content = doc.get('content', '') or doc.get('text', '')
            doc_keywords = self.extract_medical_keywords(doc_content)
            all_source_keywords.update(doc_keywords)

            # Calculate overlap for this specific document
            doc_overlap = advice_keywords.intersection(doc_keywords)
            doc_coverage = len(doc_overlap) / len(doc_keywords) if doc_keywords else 0.0

            coverage_details.append({
                "doc_index": i,
                "doc_snippet": doc_content[:100] + "...",
                "doc_keywords_count": len(doc_keywords),
                "matched_keywords_count": len(doc_overlap),
                "doc_coverage_ratio": doc_coverage,
                "matched_keywords": list(doc_overlap)[:10]  # Limit for readability
            })

        # Calculate overall coverage
        matched_keywords = advice_keywords.intersection(all_source_keywords)
        coverage_score = len(matched_keywords) / len(all_source_keywords) if all_source_keywords else 0.0

        return {
            "coverage_score": coverage_score,
            "matched_keywords": list(matched_keywords),
            "advice_keywords": list(advice_keywords),
            "source_keywords": list(all_source_keywords),
            "advice_keywords_count": len(advice_keywords),
            "source_keywords_count": len(all_source_keywords),
            "matched_keywords_count": len(matched_keywords),
            "coverage_percentage": coverage_score * 100,
            "meets_threshold": coverage_score >= 0.6,
            "coverage_details": coverage_details
        }

    def evaluate_single_coverage(self, query: str, category: str = "unknown") -> Dict[str, Any]:
        """
        Evaluate retrieval coverage for a single query

        Requires full pipeline: extraction → retrieval → generation → coverage analysis

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)
        """
        print(f"🔍 Testing coverage for: {query[:50]}...")
        print(f"📋 Category: {category}")

        try:
            # Step 1: Extract condition
            condition_result = self.user_prompt_processor.extract_condition_keywords(query)

            # Step 2: Perform retrieval
            search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
            if not search_query:
                search_query = condition_result.get('condition', query)

            retrieval_start = datetime.now()
            retrieval_results = self.retrieval_system.search(search_query, top_k=5)
            retrieval_time = (datetime.now() - retrieval_start).total_seconds()

            processed_results = retrieval_results.get('processed_results', [])

            if not processed_results:
                result = {
                    "query": query,
                    "category": category,
                    "search_query": search_query,
                    "pipeline_success": False,
                    "coverage_score": 0.0,
                    "error": "No retrieval results",
                    "timestamp": datetime.now().isoformat()
                }

                self.coverage_results.append(result)
                print(f" ❌ No retrieval results for coverage analysis")
                return result

            # Step 3: Generate medical advice
            generation_start = datetime.now()
            intention = self._detect_query_intention(query)
            medical_advice_result = self.medical_generator.generate_medical_advice(
                user_query=query,
                retrieval_results=retrieval_results,
                intention=intention
            )
            generation_time = (datetime.now() - generation_start).total_seconds()

            generated_advice = medical_advice_result.get('medical_advice', '')

            if not generated_advice:
                result = {
                    "query": query,
                    "category": category,
                    "search_query": search_query,
                    "pipeline_success": False,
                    "coverage_score": 0.0,
                    "error": "No generated advice",
                    "timestamp": datetime.now().isoformat()
                }

                self.coverage_results.append(result)
                print(f" ❌ No generated advice for coverage analysis")
                return result

            # Step 4: Calculate coverage
            coverage_analysis = self.calculate_coverage_score(generated_advice, processed_results)

            result = {
                "query": query,
                "category": category,
                "search_query": search_query,
                "pipeline_success": True,
                "retrieval_time": retrieval_time,
                "generation_time": generation_time,
                "retrieved_docs_count": len(processed_results),
                "generated_advice_length": len(generated_advice),
                "coverage_analysis": coverage_analysis,
                "coverage_score": coverage_analysis['coverage_score'],
                "meets_threshold": coverage_analysis['meets_threshold'],
                "timestamp": datetime.now().isoformat()
            }

            # Store result
            self.coverage_results.append(result)

            print(f" ✅ Pipeline: Complete")
            print(f" 📊 Coverage Score: {coverage_analysis['coverage_score']:.3f} ({coverage_analysis['coverage_percentage']:.1f}%)")
            print(f" 📝 Keywords: {coverage_analysis['matched_keywords_count']}/{coverage_analysis['source_keywords_count']} matched")
            print(f" 🎯 Threshold: {'✅ Met' if result['meets_threshold'] else '❌ Not Met'}")
            print(f" ⏱️ Times: Retrieval={retrieval_time:.2f}s, Generation={generation_time:.2f}s")

            return result

        except Exception as e:
            error_result = {
                "query": query,
                "category": category,
                "pipeline_success": False,
                "coverage_score": 0.0,
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }

            self.coverage_results.append(error_result)
            print(f" ❌ Coverage evaluation failed: {e}")

            return error_result

    def _detect_query_intention(self, query: str) -> str:
        """Simplified query intention detection (from app.py)"""
        query_lower = query.lower()

        if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
            return 'diagnosis'
        elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
            return 'treatment'
        else:
            return 'mixed'

    def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
        """Parse queries from file with category labels"""
        print(f"📁 Reading queries from file: {filepath}")

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            # Parse queries with category labels
            queries_by_category = {
                "diagnosis": [],
                "treatment": [],
                "mixed": []
            }

            lines = content.strip().split('\n')

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Parse format: "1.diagnosis: query text"
                match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
                if match:
                    category_raw = match.group(1).lower()
                    query_text = match.group(2).strip()

                    # Normalize category name
                    if category_raw in ['mixed/complicated', 'mixed']:
                        category = 'mixed'
                    else:
                        category = category_raw

                    if category in queries_by_category and len(query_text) > 15:
                        queries_by_category[category].append({
                            "text": query_text,
                            "category": category
                        })

            print(f"📋 Parsed queries by category:")
            for category, category_queries in queries_by_category.items():
                print(f" {category.capitalize()}: {len(category_queries)} queries")

            return queries_by_category

        except Exception as e:
            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

    def calculate_coverage_statistics(self) -> Dict[str, Any]:
        """Calculate coverage statistics by category"""
        category_stats = {}
        all_successful_results = []

        # Group results by category
        results_by_category = {
            "diagnosis": [],
            "treatment": [],
            "mixed": []
        }

        for result in self.coverage_results:
            category = result.get('category', 'unknown')
            if category in results_by_category:
                results_by_category[category].append(result)
                if result.get('pipeline_success'):
                    all_successful_results.append(result)

        # Calculate statistics for each category
        for category, results in results_by_category.items():
            successful_results = [r for r in results if r.get('pipeline_success')]

            if successful_results:
                coverage_scores = [r['coverage_score'] for r in successful_results]
                avg_coverage = sum(coverage_scores) / len(coverage_scores)
                avg_retrieval_time = sum(r.get('retrieval_time', 0) for r in successful_results) / len(successful_results)
                avg_generation_time = sum(r.get('generation_time', 0) for r in successful_results) / len(successful_results)

                category_stats[category] = {
                    "average_coverage": avg_coverage,
                    "max_coverage": max(coverage_scores),
                    "min_coverage": min(coverage_scores),
                    "successful_evaluations": len(successful_results),
                    "total_queries": len(results),
                    "success_rate": len(successful_results) / len(results),
                    "average_retrieval_time": avg_retrieval_time,
                    "average_generation_time": avg_generation_time,
                    "meets_threshold": avg_coverage >= 0.6,
                    "individual_coverage_scores": coverage_scores
                }
            else:
                category_stats[category] = {
                    "average_coverage": 0.0,
                    "max_coverage": 0.0,
                    "min_coverage": 0.0,
                    "successful_evaluations": 0,
                    "total_queries": len(results),
                    "success_rate": 0.0,
                    "average_retrieval_time": 0.0,
                    "average_generation_time": 0.0,
                    "meets_threshold": False,
                    "individual_coverage_scores": []
                }

        # Calculate overall statistics
        if all_successful_results:
            all_coverage_scores = [r['coverage_score'] for r in all_successful_results]
            overall_stats = {
                "average_coverage": sum(all_coverage_scores) / len(all_coverage_scores),
                "max_coverage": max(all_coverage_scores),
                "min_coverage": min(all_coverage_scores),
                "successful_evaluations": len(all_successful_results),
                "total_queries": len(self.coverage_results),
                "success_rate": len(all_successful_results) / len(self.coverage_results),
                "meets_threshold": (sum(all_coverage_scores) / len(all_coverage_scores)) >= 0.6,
                "target_compliance": (sum(all_coverage_scores) / len(all_coverage_scores)) >= 0.6
            }
        else:
            overall_stats = {
                "average_coverage": 0.0,
                "max_coverage": 0.0,
                "min_coverage": 0.0,
                "successful_evaluations": 0,
                "total_queries": len(self.coverage_results),
                "success_rate": 0.0,
                "meets_threshold": False,
                "target_compliance": False
            }

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "timestamp": datetime.now().isoformat()
        }

    def save_coverage_statistics(self, filename: str = None) -> str:
        """Save coverage statistics for chart generation"""
        stats = self.calculate_coverage_statistics()

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"coverage_statistics_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)

        filepath = results_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

        print(f"📊 Coverage statistics saved to: {filepath}")
        return str(filepath)

    def save_coverage_details(self, filename: str = None) -> str:
        """Save detailed coverage results"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"coverage_details_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)

        filepath = results_dir / filename

        # Create comprehensive coverage data
        coverage_data = {
            "evaluation_metadata": {
                "total_queries": len(self.coverage_results),
                "successful_evaluations": len([r for r in self.coverage_results if r.get('pipeline_success')]),
                "timestamp": datetime.now().isoformat(),
                "evaluator_type": "retrieval_coverage",
                "threshold_used": 0.6
            },
            "coverage_results": self.coverage_results
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(coverage_data, f, indent=2, ensure_ascii=False)

        print(f"📝 Coverage details saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent coverage evaluation interface"""

    print("📈 OnCall.ai Coverage Evaluator - Retrieval Coverage Analysis")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
    else:
        # Default to evaluation/pre_user_query_evaluate.txt
        query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python coverage_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = CoverageEvaluator()

    # Parse queries from file
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test coverage for each query (requires full pipeline)
    print(f"\n🧪 Retrieval Coverage Testing (Full Pipeline Required)")
    print(f"⚠️ Note: This evaluator requires LLM calls for advice generation")

    for category, queries in queries_by_category.items():
        if not queries:
            continue

        print(f"\n📂 Testing {category.upper()} coverage:")

        for i, query_info in enumerate(queries):
            query_text = query_info['text']

            # Test coverage (requires full pipeline)
            result = evaluator.evaluate_single_coverage(query_text, category)

            # Pause between queries to avoid rate limits
            if i < len(queries) - 1:
                print(f" ⏳ Pausing 5s before next query...")
                import time
                time.sleep(5)

        # Longer pause between categories
        if category != list(queries_by_category.keys())[-1]:
            print(f"\n⏳ Pausing 10s before next category...")
            import time
            time.sleep(10)

    # Generate and save results
    print(f"\n📊 Generating coverage analysis...")

    # Save statistics and details
    stats_path = evaluator.save_coverage_statistics()
    details_path = evaluator.save_coverage_details()

    # Print final summary
    stats = evaluator.calculate_coverage_statistics()
    category_results = stats['category_results']
    overall_results = stats['overall_results']

    print(f"\n📊 === COVERAGE EVALUATION SUMMARY ===")
    print(f"Overall Performance:")
    print(f" Average Coverage: {overall_results['average_coverage']:.3f} ({overall_results['average_coverage']*100:.1f}%)")
    print(f" Pipeline Success Rate: {overall_results['success_rate']:.1%}")
    print(f" 60% Threshold: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")

    print(f"\nCategory Breakdown:")
    for category, cat_stats in category_results.items():
        if cat_stats['total_queries'] > 0:
            print(f" {category.capitalize()}: {cat_stats['average_coverage']:.3f} "
                  f"({cat_stats['successful_evaluations']}/{cat_stats['total_queries']}) "
                  f"[R:{cat_stats['average_retrieval_time']:.2f}s, G:{cat_stats['average_generation_time']:.2f}s]")

    print(f"\n✅ Coverage evaluation complete!")
    print(f"📊 Statistics: {stats_path}")
    print(f"📝 Details: {details_path}")
evaluation/{evaluation_instruction.md → old/evaluation_instruction.md}
RENAMED
|
| 1 |
# Model use
|
| 2 |
+
|
| 3 |
llm model: (for comparison) with our-own version.
|
| 4 |
https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
|
| 5 |
https://huggingface.co/m42-health/Llama3-Med42-70B
|
|
|
|
| 13 |
"""
|
| 14 |
```
|
| 15 |
|
|
|
|
| 16 |
### 評估執行流程
|
| 17 |
+
|
| 18 |
```python
|
| 19 |
def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
|
| 20 |
"""執行完整的六項指標評估"""
|
| 21 |
+
|
| 22 |
results = {
|
| 23 |
"model": model_name,
|
| 24 |
"metrics": {},
|
| 25 |
"detailed_results": []
|
| 26 |
}
|
| 27 |
+
|
| 28 |
total_latencies = []
|
| 29 |
extraction_successes = []
|
| 30 |
relevance_scores = []
|
| 31 |
coverage_scores = []
|
| 32 |
actionability_scores = []
|
| 33 |
evidence_scores = []
|
| 34 |
+
|
| 35 |
for query in test_cases:
|
| 36 |
# 運行模型並測量所有指標
|
| 37 |
start_time = time.time()
|
| 38 |
+
|
| 39 |
# 1. 總處理時長
|
| 40 |
latency_result = measure_total_latency(query)
|
| 41 |
total_latencies.append(latency_result['total_latency'])
|
| 42 |
+
|
| 43 |
# 2. 條件抽取成功率
|
| 44 |
extraction_result = evaluate_condition_extraction([query])
|
| 45 |
extraction_successes.append(extraction_result['success_rate'])
|
| 46 |
+
|
| 47 |
# 3 & 4. 檢索相關性和覆蓋率(需要實際檢索結果)
|
| 48 |
retrieval_results = get_retrieval_results(query)
|
| 49 |
relevance_result = evaluate_retrieval_relevance(retrieval_results)
|
| 50 |
relevance_scores.append(relevance_result['average_relevance'])
|
| 51 |
+
|
| 52 |
generated_advice = get_generated_advice(query, retrieval_results)
|
| 53 |
coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
|
| 54 |
coverage_scores.append(coverage_result['coverage'])
|
| 55 |
+
|
| 56 |
# 5 & 6. LLM 評估(需要完整回應)
|
| 57 |
response_data = {
|
| 58 |
'query': query,
|
| 59 |
'advice': generated_advice,
|
| 60 |
'retrieval_results': retrieval_results
|
| 61 |
}
|
| 62 |
+
|
| 63 |
actionability_result = evaluate_clinical_actionability([response_data])
|
| 64 |
actionability_scores.append(actionability_result[0]['overall_score'])
|
| 65 |
+
|
| 66 |
evidence_result = evaluate_clinical_evidence([response_data])
|
| 67 |
evidence_scores.append(evidence_result[0]['overall_score'])
|
| 68 |
+
|
| 69 |
# 記錄詳細結果
|
| 70 |
results["detailed_results"].append({
|
| 71 |
"query": query,
|
|
|
|
| 76 |
"actionability": actionability_result[0],
|
| 77 |
"evidence": evidence_result[0]
|
| 78 |
})
|
| 79 |
+
|
| 80 |
# 計算平均指標
|
| 81 |
results["metrics"] = {
|
| 82 |
"average_latency": sum(total_latencies) / len(total_latencies),
|
|
|
|
| 86 |
"average_actionability": sum(actionability_scores) / len(actionability_scores),
|
| 87 |
"average_evidence_score": sum(evidence_scores) / len(evidence_scores)
|
| 88 |
}
|
| 89 |
+
|
| 90 |
return results
|
| 91 |
```
|
| 92 |
|
|
|
|
| 95 |
## 📈 評估結果分析框架
|
| 96 |
|
| 97 |
### 統計分析
|
| 98 |
+
|
| 99 |
```python
|
| 100 |
def analyze_evaluation_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
|
| 101 |
"""比較三個模型的評估結果"""
|
| 102 |
+
|
| 103 |
models = ['Med42-70B_direct', 'RAG_enhanced', 'OpenBioLLM-70B']
|
| 104 |
metrics = ['latency', 'extraction_success_rate', 'relevance', 'coverage', 'actionability', 'evidence_score']
|
| 105 |
+
|
| 106 |
comparison = {}
|
| 107 |
+
|
| 108 |
for metric in metrics:
|
| 109 |
comparison[metric] = {
|
| 110 |
models[0]: results_A['metrics'][f'average_{metric}'],
|
| 111 |
models[1]: results_B['metrics'][f'average_{metric}'],
|
| 112 |
models[2]: results_C['metrics'][f'average_{metric}']
|
| 113 |
}
|
| 114 |
+
|
| 115 |
# 計算相對改進
|
| 116 |
baseline = comparison[metric][models[0]]
|
| 117 |
rag_improvement = ((comparison[metric][models[1]] - baseline) / baseline) * 100
|
| 118 |
+
|
| 119 |
comparison[metric]['rag_improvement_percent'] = rag_improvement
|
| 120 |
+
|
| 121 |
return comparison
|
| 122 |
```
|
| 123 |
|
| 124 |
### 報告生成
|
| 125 |
+
|
| 126 |
```python
|
| 127 |
def generate_evaluation_report(comparison_results: Dict) -> str:
|
| 128 |
"""生成評估報告"""
|
| 129 |
+
|
| 130 |
report = f"""
|
| 131 |
# OnCall.ai 系統評估報告
|
| 132 |
+
|
| 133 |
## 評估摘要
|
| 134 |
+
|
| 135 |
| 指標 | Med42-70B | RAG增強版 | OpenBioLLM | RAG改進% |
|
| 136 |
|------|-----------|-----------|------------|----------|
|
| 137 |
| 處理時長 | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['RAG_enhanced']:.2f}s | {comparison_results['latency']['OpenBioLLM-70B']:.2f}s | {comparison_results['latency']['rag_improvement_percent']:+.1f}% |
|
|
|
|
| 140 |
| 檢索覆蓋率 | - | {comparison_results['coverage']['RAG_enhanced']:.1%} | - | - |
|
| 141 |
| 臨床可操作性 | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['RAG_enhanced']:.1f}/10 | {comparison_results['actionability']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['actionability']['rag_improvement_percent']:+.1f}% |
|
| 142 |
| 臨床證據評分 | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['RAG_enhanced']:.1f}/10 | {comparison_results['evidence_score']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['evidence_score']['rag_improvement_percent']:+.1f}% |
|
| 143 |
+
|
| 144 |
"""
|
| 145 |
+
|
| 146 |
return report
|
| 147 |
```
|
| 148 |
|
|
|
|
| 151 |
## 🔧 實驗執行步驟
|
| 152 |
|
| 153 |
### 1. 環境準備
|
| 154 |
+
|
| 155 |
```bash
|
| 156 |
# 設置 HuggingFace token(用於 Inference Providers)
|
| 157 |
export HF_TOKEN=your_huggingface_token
|
|
|
|
| 161 |
```
|
| 162 |
|
| 163 |
### 2. 實驗執行腳本框架
|
| 164 |
+
|
| 165 |
```python
|
| 166 |
# evaluation/run_evaluation.py
|
| 167 |
def main():
|
| 168 |
"""主要評估執行函數"""
|
| 169 |
+
|
| 170 |
# 加載測試用例
|
| 171 |
test_cases = MEDICAL_TEST_CASES
|
| 172 |
+
|
| 173 |
# 實驗 A: YanBo 系統評估
|
| 174 |
print("🔬 開始實驗 A: YanBo 系統評估")
|
| 175 |
results_med42_direct = run_complete_evaluation("Med42-70B_direct", test_cases)
|
| 176 |
+
results_general_rag = run_complete_evaluation("Med42-70B_general_RAG", test_cases)
|
| 177 |
results_openbio = run_complete_evaluation("OpenBioLLM-70B", test_cases)
|
| 178 |
+
|
| 179 |
# 分析和報告
|
| 180 |
comparison_A = analyze_evaluation_results(results_med42_direct, results_general_rag, results_openbio)
|
| 181 |
report_A = generate_evaluation_report(comparison_A)
|
| 182 |
+
|
| 183 |
# 保存結果
|
| 184 |
save_results("evaluation/results/yanbo_evaluation.json", {
|
| 185 |
"comparison": comparison_A,
|
| 186 |
"detailed_results": [results_med42_direct, results_general_rag, results_openbio]
|
| 187 |
})
|
| 188 |
+
|
| 189 |
print("✅ 實驗 A 完成,結果已保存")
|
| 190 |
+
|
| 191 |
# 實驗 B: Jeff 系統評估
|
| 192 |
print("🔬 開始實驗 B: Jeff 系統評估")
|
| 193 |
results_med42_direct_b = run_complete_evaluation("Med42-70B_direct", test_cases)
|
| 194 |
results_customized_rag = run_complete_evaluation("Med42-70B_customized_RAG", test_cases)
|
| 195 |
results_openbio_b = run_complete_evaluation("OpenBioLLM-70B", test_cases)
|
| 196 |
+
|
| 197 |
# 分析和報告
|
| 198 |
comparison_B = analyze_evaluation_results(results_med42_direct_b, results_customized_rag, results_openbio_b)
|
| 199 |
report_B = generate_evaluation_report(comparison_B)
|
| 200 |
+
|
| 201 |
# 保存結果
|
| 202 |
save_results("evaluation/results/jeff_evaluation.json", {
|
| 203 |
"comparison": comparison_B,
|
| 204 |
"detailed_results": [results_med42_direct_b, results_customized_rag, results_openbio_b]
|
| 205 |
})
|
| 206 |
+
|
| 207 |
print("✅ 實驗 B 完成,結果已保存")
|
| 208 |
|
| 209 |
if __name__ == "__main__":
|
|
|
|
| 211 |
```
|
| 212 |
|
| 213 |
### 3. 預期評估時間
|
| 214 |
+
|
| 215 |
```
|
| 216 |
總評估時間估算:
|
| 217 |
├── 每個查詢處理時間:~30秒(包含LLM評估)
|
|
|
|
| 225 |
## 📊 評估成功標準
|
| 226 |
|
| 227 |
### 系統性能目標
|
| 228 |
+
|
| 229 |
```
|
| 230 |
✅ 達標條件:
|
| 231 |
1. 總處理時長 ≤ 30秒
|
| 232 |
+
2. 條件抽取成功率 ≥ 80%
|
| 233 |
3. 檢索相關性 ≥ 0.2
|
| 234 |
4. 檢索覆蓋率 ≥ 60%
|
| 235 |
5. 臨床可操作性 ≥ 7.0/10
|
|
|
|
| 241 |
```
|
| 242 |
|
| 243 |
### 比較分析重點
|
| 244 |
+
|
| 245 |
```
|
| 246 |
重點分析維度:
|
| 247 |
├── RAG 對處理時間的影響(可能增加延遲)
|
|
|
|
| 255 |
## 🛠️ 實施建議
|
| 256 |
|
| 257 |
### 分階段實施
|
| 258 |
+
|
| 259 |
```
|
| 260 |
階段1: 基礎指標實現(1-4項)
|
| 261 |
├── 利用現有 app.py 中的時間測量
|
|
|
|
| 277 |
```
|
| 278 |
|
| 279 |
### 實施注意事項
|
| 280 |
+
|
| 281 |
```
|
| 282 |
⚠️ 重要提醒:
|
| 283 |
1. 所有評估代碼應獨立於現有系統,避免影響正常運行
|
|
|
|
| 290 |
---
|
| 291 |
|
| 292 |
**評估指南完成。請根據此指南實施評估實驗。**
|
| 293 |
+
|
| 294 |
+
## Phase 1: Initial Assessment
|
| 295 |
+
|
| 296 |
+
### Step 1.1 - 分析您的說明
|
| 297 |
+
|
| 298 |
+
我重新理解了您的意思!
|
| 299 |
+
|
| 300 |
+
### Step 1.2 - 無不清楚的需求
|
| 301 |
+
|
| 302 |
+
### Step 1.3 - 確認理解(中文)
|
| 303 |
+
|
| 304 |
+
完全明白了!您的評估架構是:
|
| 305 |
+
|
| 306 |
+
## 🎯 **評估指標的測試層級**
|
| 307 |
+
|
| 308 |
+
### **單獨測試 Retrieval(指標 2, 3, 4)**
|
| 309 |
+
|
| 310 |
+
```python
|
| 311 |
+
# 只測試 RAG 系統,因為只有 RAG 系統有這些組件:
|
| 312 |
+
retrieval_only_metrics = [
|
| 313 |
+
"條件抽取成功率", # 只有您的系統有 user_prompt.py
|
| 314 |
+
"檢索相關性", # 只有 RAG 系統有檢索結果
|
| 315 |
+
"檢索覆蓋率" # 只有 RAG 系統有檢索→生成的映射
|
| 316 |
+
]
|
| 317 |
+
|
| 318 |
+
# 測試對象:
|
| 319 |
+
# - Med42-70B_general_RAG(您的系統)✅
|
| 320 |
+
# - Med42-70B_customized_RAG(Jeff系統)✅
|
| 321 |
+
# - Med42-70B_direct(無RAG)❌ 沒有檢索組件
|
| 322 |
+
# - OpenBioLLM-70B(無RAG)❌ 沒有檢索組件
|
| 323 |
+
```
|
| 324 |
+
|
| 325 |
+
### **三模型一起測試(指標 1, 5, 6)**
|
| 326 |
+
|
| 327 |
+
```python
|
| 328 |
+
# 所有模型都能測試的指標:
|
| 329 |
+
universal_metrics = [
|
| 330 |
+
"總處理時長", # 所有模型都有響應時間
|
| 331 |
+
"臨床可操作性", # Llama3-70B 評估所有模型的輸��
|
| 332 |
+
"臨床證據評分" # Llama3-70B 評估所有模型的輸出
|
| 333 |
+
]
|
| 334 |
+
|
| 335 |
+
# 測試對象:
|
| 336 |
+
# - Med42-70B_direct ✅
|
| 337 |
+
# - Med42-70B_general_RAG ✅
|
| 338 |
+
# - OpenBioLLM-70B ✅
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
## 📊 **分層評估策略**
|
| 342 |
+
|
| 343 |
+
### **階段 1:RAG 系統內部評估**
|
| 344 |
+
|
| 345 |
+
```python
|
| 346 |
+
# 只測試有 RAG 的系統
|
| 347 |
+
rag_systems = ["Med42-70B_general_RAG", "Med42-70B_customized_RAG"]
|
| 348 |
+
|
| 349 |
+
# 測試 RAG 特有指標
|
| 350 |
+
for system in rag_systems:
|
| 351 |
+
evaluate_retrieval_metrics(system) # 指標 2, 3, 4
|
| 352 |
+
```
|
| 353 |
+
|
| 354 |
+
### **階段 2:全模型比較評估**
|
| 355 |
+
|
| 356 |
+
```python
|
| 357 |
+
# 測試所有三個模型
|
| 358 |
+
all_systems = ["Med42-70B_direct", "Med42-70B_general_RAG", "OpenBioLLM-70B"]
|
| 359 |
+
|
| 360 |
+
# 測試通用指標
|
| 361 |
+
for system in all_systems:
|
| 362 |
+
evaluate_universal_metrics(system) # 指標 1, 5, 6
|
| 363 |
+
```
|
| 364 |
+
|
| 365 |
+
### **階段 3:綜合分析**
|
| 366 |
+
|
| 367 |
+
```python
|
| 368 |
+
# 合併兩階段結果,生成完整報告
|
| 369 |
+
combine_evaluation_results()
|
| 370 |
+
```
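
`combine_evaluation_results()` 在此只是佔位名稱;以下提供一個假設性的合併示意,把階段 1 的 RAG 專屬指標與階段 2 的通用指標整併成以系統名稱為鍵的單一結構(鍵名與資料形狀均為示意):

```python
from typing import Any, Dict


def combine_evaluation_results(
    rag_metrics: Dict[str, Dict[str, float]],         # stage 1: retrieval metrics per RAG system
    universal_metrics: Dict[str, Dict[str, float]],    # stage 2: universal metrics per model
) -> Dict[str, Any]:
    """Sketch: merge both evaluation stages into one report keyed by system name."""
    combined: Dict[str, Any] = {}
    for system, metrics in universal_metrics.items():
        combined[system] = {"universal": metrics}
    for system, metrics in rag_metrics.items():
        combined.setdefault(system, {})["retrieval"] = metrics
    return combined
```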
|
| 371 |
+
|
| 372 |
+
### Step 1.4 - 無視覺修改確認 ✅
|
| 373 |
+
|
| 374 |
+
此為評估策略討論,無涉及代碼修改。
|
| 375 |
+
|
| 376 |
+
**您的理解完全正確!RAG 特有的指標只能在 RAG 系統內部測試,而通用指標可以跨所有模型比較。這樣的分層評估策略非常合理!**
|
| 377 |
+
|
| 378 |
+
---
|
| 379 |
+
|
| 380 |
+
## 📊 第七個評估指標(YanBo系統特有)
|
| 381 |
+
|
| 382 |
+
### 7. 多層級 Fallback 效率(早期攔截率)
|
| 383 |
+
|
| 384 |
+
**定義:** 系統通過多層級 Fallback 機制在早期層級成功處理查詢的效率
|
| 385 |
+
|
| 386 |
+
**測量位置:** `src/user_prompt.py` 的 `extract_condition_keywords` 多層級處理邏輯
|
| 387 |
+
|
| 388 |
+
**計算公式:**
|
| 389 |
+
```
|
| 390 |
+
Early_Interception_Rate = (Level1_Success + Level2_Success) / Total_Queries
|
| 391 |
+
|
| 392 |
+
其中:
|
| 393 |
+
- Level1_Success = 在預定義映射中直接找到條件的查詢數
|
| 394 |
+
- Level2_Success = 通過LLM抽取成功的查詢數
|
| 395 |
+
- Total_Queries = 測試查詢總數
|
| 396 |
+
|
| 397 |
+
時間節省效果:
|
| 398 |
+
Time_Savings = (Late_Avg_Time - Early_Avg_Time) / Late_Avg_Time
|
| 399 |
+
|
| 400 |
+
早期攔截效率:
|
| 401 |
+
Efficiency_Score = Early_Interception_Rate × (1 + Time_Savings)
|
| 402 |
+
```
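
以假設數字代入上述公式驗算一次(100 筆查詢中 35 筆在 Level 1、40 筆在 Level 2 成功,與下方 ASCII 圖一致;早期/後期平均耗時 8 秒與 20 秒純屬示意):

```python
# Worked example with assumed numbers, matching the ASCII diagram below.
level1_success, level2_success, total_queries = 35, 40, 100
early_interception_rate = (level1_success + level2_success) / total_queries   # 0.75

early_avg_time, late_avg_time = 8.0, 20.0                                     # assumed seconds
time_savings = (late_avg_time - early_avg_time) / late_avg_time               # 0.60

efficiency_score = early_interception_rate * (1 + time_savings)               # 1.20
print(early_interception_rate, time_savings, efficiency_score)
```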
|
| 403 |
+
|
| 404 |
+
**ASCII 流程圖:**
|
| 405 |
+
```
|
| 406 |
+
多層級 Fallback 效率示意圖:
|
| 407 |
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
| 408 |
+
│ 用戶查詢 │───▶│ Level 1 │───▶│ 直接成功 │
|
| 409 |
+
│ "胸痛診斷" │ │ 預定義映射 │ │ 35% (快) │
|
| 410 |
+
└─────────────┘ └─────────────┘ └─────────────┘
|
| 411 |
+
│
|
| 412 |
+
▼ (失敗)
|
| 413 |
+
┌─────────────┐ ┌─────────────┐
|
| 414 |
+
│ Level 2 │───▶│ LLM抽取成功 │
|
| 415 |
+
│ LLM 條件抽取│ │ 40% (中等) │
|
| 416 |
+
└─────────────┘ └─────────────┘
|
| 417 |
+
│
|
| 418 |
+
▼ (失敗)
|
| 419 |
+
┌─────────────┐ ┌─────────────┐
|
| 420 |
+
│ Level 3-5 │───▶│ 後備成功 │
|
| 421 |
+
│ 後續層級 │ │ 20% (慢) │
|
| 422 |
+
└─────────────┘ └─────────────┘
|
| 423 |
+
│
|
| 424 |
+
▼ (失敗)
|
| 425 |
+
┌─────────────┐
|
| 426 |
+
│ 完全失敗 │
|
| 427 |
+
│ 5% (錯誤) │
|
| 428 |
+
└─────────────┘
|
| 429 |
+
|
| 430 |
+
早期攔截率 = (35% + 40%) = 75% ✅ 目標 > 70%
|
| 431 |
+
```
|
| 432 |
+
|
| 433 |
+
**實現框架:**
|
| 434 |
+
```python
|
| 435 |
+
import time
from typing import Dict, List, Tuple

# 基於 user_prompt.py 的多層級處理邏輯
|
| 436 |
+
def evaluate_early_interception_efficiency(test_queries: List[str]) -> Dict[str, float]:
|
| 437 |
+
"""評估早期攔截率 - YanBo系統核心優勢"""
|
| 438 |
+
|
| 439 |
+
level1_success = 0 # Level 1: 預定義映射成功
|
| 440 |
+
level2_success = 0 # Level 2: LLM 抽取成功
|
| 441 |
+
later_success = 0 # Level 3-5: 後續層級成功
|
| 442 |
+
total_failures = 0 # 完全失敗
|
| 443 |
+
|
| 444 |
+
early_times = [] # 早期成功的處理時間
|
| 445 |
+
late_times = [] # 後期成功的處理時間
|
| 446 |
+
|
| 447 |
+
for query in test_queries:
|
| 448 |
+
# 追蹤每個查詢的成功層級和時間
|
| 449 |
+
success_level, processing_time = track_query_success_level(query)
|
| 450 |
+
|
| 451 |
+
if success_level == 1:
|
| 452 |
+
level1_success += 1
|
| 453 |
+
early_times.append(processing_time)
|
| 454 |
+
elif success_level == 2:
|
| 455 |
+
level2_success += 1
|
| 456 |
+
early_times.append(processing_time)
|
| 457 |
+
elif success_level in [3, 4, 5]:
|
| 458 |
+
later_success += 1
|
| 459 |
+
late_times.append(processing_time)
|
| 460 |
+
else:
|
| 461 |
+
total_failures += 1
|
| 462 |
+
|
| 463 |
+
total_queries = len(test_queries)
|
| 464 |
+
early_success_count = level1_success + level2_success
|
| 465 |
+
|
| 466 |
+
# 計算時間節省效果
|
| 467 |
+
early_avg_time = sum(early_times) / len(early_times) if early_times else 0
|
| 468 |
+
late_avg_time = sum(late_times) / len(late_times) if late_times else 0
|
| 469 |
+
time_savings = (late_avg_time - early_avg_time) / late_avg_time if late_avg_time > 0 else 0
|
| 470 |
+
|
| 471 |
+
# 綜合效率分數
|
| 472 |
+
early_interception_rate = early_success_count / total_queries
|
| 473 |
+
efficiency_score = early_interception_rate * (1 + time_savings)
|
| 474 |
+
|
| 475 |
+
return {
|
| 476 |
+
# 核心指標
|
| 477 |
+
"early_interception_rate": early_interception_rate, # 早期攔截率
|
| 478 |
+
"level1_success_rate": level1_success / total_queries,
|
| 479 |
+
"level2_success_rate": level2_success / total_queries,
|
| 480 |
+
|
| 481 |
+
# 時間效率
|
| 482 |
+
"early_avg_time": early_avg_time,
|
| 483 |
+
"late_avg_time": late_avg_time,
|
| 484 |
+
"time_savings_rate": time_savings,
|
| 485 |
+
|
| 486 |
+
# 系統健康度
|
| 487 |
+
"total_success_rate": (total_queries - total_failures) / total_queries,
|
| 488 |
+
"miss_rate": total_failures / total_queries,
|
| 489 |
+
|
| 490 |
+
# 綜合效率
|
| 491 |
+
"overall_efficiency_score": efficiency_score,
|
| 492 |
+
|
| 493 |
+
# 詳細分布
|
| 494 |
+
"success_distribution": {
|
| 495 |
+
"level1": level1_success,
|
| 496 |
+
"level2": level2_success,
|
| 497 |
+
"later_levels": later_success,
|
| 498 |
+
"failures": total_failures
|
| 499 |
+
}
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
def track_query_success_level(query: str) -> Tuple[int, float]:
|
| 503 |
+
"""
|
| 504 |
+
追蹤查詢在哪個層級成功並記錄時間
|
| 505 |
+
|
| 506 |
+
Args:
|
| 507 |
+
query: 測試查詢
|
| 508 |
+
|
| 509 |
+
Returns:
|
| 510 |
+
Tuple of (success_level, processing_time)
|
| 511 |
+
"""
|
| 512 |
+
start_time = time.time()
|
| 513 |
+
|
| 514 |
+
# 模擬 user_prompt.py 的層級處理邏輯
|
| 515 |
+
try:
|
| 516 |
+
# Level 1: 檢查預定義映射
|
| 517 |
+
if check_predefined_mapping(query):
|
| 518 |
+
processing_time = time.time() - start_time
|
| 519 |
+
return (1, processing_time)
|
| 520 |
+
|
| 521 |
+
# Level 2: LLM 條件抽取
|
| 522 |
+
llm_result = llm_client.analyze_medical_query(query)
|
| 523 |
+
if llm_result.get('extracted_condition'):
|
| 524 |
+
processing_time = time.time() - start_time
|
| 525 |
+
return (2, processing_time)
|
| 526 |
+
|
| 527 |
+
# Level 3: 語義搜索
|
| 528 |
+
semantic_result = semantic_search_fallback(query)
|
| 529 |
+
if semantic_result:
|
| 530 |
+
processing_time = time.time() - start_time
|
| 531 |
+
return (3, processing_time)
|
| 532 |
+
|
| 533 |
+
# Level 4: 醫學驗證
|
| 534 |
+
validation_result = validate_medical_query(query)
|
| 535 |
+
if not validation_result: # 驗證通過
|
| 536 |
+
processing_time = time.time() - start_time
|
| 537 |
+
return (4, processing_time)
|
| 538 |
+
|
| 539 |
+
# Level 5: 通用搜索
|
| 540 |
+
generic_result = generic_medical_search(query)
|
| 541 |
+
if generic_result:
|
| 542 |
+
processing_time = time.time() - start_time
|
| 543 |
+
return (5, processing_time)
|
| 544 |
+
|
| 545 |
+
# 完全失敗
|
| 546 |
+
processing_time = time.time() - start_time
|
| 547 |
+
return (0, processing_time)
|
| 548 |
+
|
| 549 |
+
except Exception as e:
|
| 550 |
+
processing_time = time.time() - start_time
|
| 551 |
+
return (0, processing_time)
|
| 552 |
+
|
| 553 |
+
def check_predefined_mapping(query: str) -> bool:
|
| 554 |
+
"""檢查查詢是否在預定義映射中"""
|
| 555 |
+
# 基於 medical_conditions.py 的 CONDITION_KEYWORD_MAPPING
|
| 556 |
+
from medical_conditions import CONDITION_KEYWORD_MAPPING
|
| 557 |
+
|
| 558 |
+
query_lower = query.lower()
|
| 559 |
+
for condition, keywords in CONDITION_KEYWORD_MAPPING.items():
|
| 560 |
+
if any(keyword.lower() in query_lower for keyword in keywords):
|
| 561 |
+
return True
|
| 562 |
+
return False
|
| 563 |
+
```
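
使用方式示意(假設上述函數已可匯入,查詢僅取兩筆作示範):

```python
# Usage sketch for the metric-7 evaluator defined above.
queries = [
    "患者胸痛怎麼處理?",
    "60歲男性,有高血壓病史,突發胸痛。可能的原因和評估方法?",
]
report = evaluate_early_interception_efficiency(queries)
print(f"Early interception rate: {report['early_interception_rate']:.1%}")
print(f"Time savings rate:       {report['time_savings_rate']:.1%}")
print(f"Overall efficiency:      {report['overall_efficiency_score']:.2f}")
```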
|
| 564 |
+
|
| 565 |
+
**目標閾值:**
|
| 566 |
+
- 早期攔截率 ≥ 70%(前兩層解決)
|
| 567 |
+
- 時間節省率 ≥ 60%(早期比後期快)
|
| 568 |
+
- 總成功率 ≥ 95%(漏接率 < 5%)
|
| 569 |
+
|
| 570 |
+
---
|
| 571 |
+
|
| 572 |
+
## 🧪 更新的完整評估流程
|
| 573 |
+
|
| 574 |
+
### 測試用例設計
|
| 575 |
+
```python
|
| 576 |
+
# 基於 readme.md 中的範例查詢設計測試集
|
| 577 |
+
MEDICAL_TEST_CASES = [
|
| 578 |
+
# Level 1 預期成功(預定義映射)
|
| 579 |
+
"患者胸痛怎麼處理?",
|
| 580 |
+
"心肌梗死的診斷方法?",
|
| 581 |
+
|
| 582 |
+
# Level 2 預期成功(LLM抽取)
|
| 583 |
+
"60歲男性,有高血壓病史,突發胸痛。可能的原因和評估方法?",
|
| 584 |
+
"30歲患者突發嚴重頭痛和頸部僵硬。鑑別診斷?",
|
| 585 |
+
|
| 586 |
+
# Level 3+ 預期成功(複雜查詢)
|
| 587 |
+
"患者急性呼吸困難和腿部水腫。應該考慮什麼?",
|
| 588 |
+
"20歲女性,無病史,突發癲癇。可能原因和完整處理流程?",
|
| 589 |
+
|
| 590 |
+
# 邊界測試
|
| 591 |
+
"疑似急性出血性中風。下一步處理?"
|
| 592 |
+
]
|
| 593 |
+
```
|
| 594 |
+
|
| 595 |
+
### 更新的評估執行流程
|
| 596 |
+
```python
|
| 597 |
+
from typing import Any, Dict, List

def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
|
| 598 |
+
"""執行完整的七項指標評估"""
|
| 599 |
+
|
| 600 |
+
results = {
|
| 601 |
+
"model": model_name,
|
| 602 |
+
"metrics": {},
|
| 603 |
+
"detailed_results": []
|
| 604 |
+
}
|
| 605 |
+
|
| 606 |
+
total_latencies = []
|
| 607 |
+
extraction_successes = []
|
| 608 |
+
relevance_scores = []
|
| 609 |
+
coverage_scores = []
|
| 610 |
+
actionability_scores = []
|
| 611 |
+
evidence_scores = []
|
| 612 |
+
fallback_efficiency_scores = [] # 新增
|
| 613 |
+
|
| 614 |
+
for query in test_cases:
|
| 615 |
+
# 運行模型並測量所有指標
|
| 616 |
+
|
| 617 |
+
# 1. 總處理時長
|
| 618 |
+
latency_result = measure_total_latency(query)
|
| 619 |
+
total_latencies.append(latency_result['total_latency'])
|
| 620 |
+
|
| 621 |
+
# 2. 條件抽取成功率
|
| 622 |
+
extraction_result = evaluate_condition_extraction([query])
|
| 623 |
+
extraction_successes.append(extraction_result['success_rate'])
|
| 624 |
+
|
| 625 |
+
# 3 & 4. 檢索相關性和覆蓋率
|
| 626 |
+
retrieval_results = get_retrieval_results(query)
|
| 627 |
+
relevance_result = evaluate_retrieval_relevance(retrieval_results)
|
| 628 |
+
relevance_scores.append(relevance_result['average_relevance'])
|
| 629 |
+
|
| 630 |
+
generated_advice = get_generated_advice(query, retrieval_results)
|
| 631 |
+
coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
|
| 632 |
+
coverage_scores.append(coverage_result['coverage'])
|
| 633 |
+
|
| 634 |
+
# 5 & 6. LLM 評估
|
| 635 |
+
response_data = {
|
| 636 |
+
'query': query,
|
| 637 |
+
'advice': generated_advice,
|
| 638 |
+
'retrieval_results': retrieval_results
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
actionability_result = evaluate_clinical_actionability([response_data])
|
| 642 |
+
actionability_scores.append(actionability_result[0]['overall_score'])
|
| 643 |
+
|
| 644 |
+
evidence_result = evaluate_clinical_evidence([response_data])
|
| 645 |
+
evidence_scores.append(evidence_result[0]['overall_score'])
|
| 646 |
+
|
| 647 |
+
# 7. 多層級 Fallback 效率(新增)
|
| 648 |
+
if model_name == "Med42-70B_general_RAG": # 只對YanBo系統測量
|
| 649 |
+
fallback_result = evaluate_early_interception_efficiency([query])
|
| 650 |
+
fallback_efficiency_scores.append(fallback_result['overall_efficiency_score'])
|
| 651 |
+
|
| 652 |
+
# 記錄詳細結果...
|
| 653 |
+
|
| 654 |
+
# 計算平均指標
|
| 655 |
+
results["metrics"] = {
|
| 656 |
+
"average_latency": sum(total_latencies) / len(total_latencies),
|
| 657 |
+
"extraction_success_rate": sum(extraction_successes) / len(extraction_successes),
|
| 658 |
+
"average_relevance": sum(relevance_scores) / len(relevance_scores),
|
| 659 |
+
"average_coverage": sum(coverage_scores) / len(coverage_scores),
|
| 660 |
+
"average_actionability": sum(actionability_scores) / len(actionability_scores),
|
| 661 |
+
"average_evidence_score": sum(evidence_scores) / len(evidence_scores),
|
| 662 |
+
# 新增指標(只對RAG系統有效)
|
| 663 |
+
"average_fallback_efficiency": sum(fallback_efficiency_scores) / len(fallback_efficiency_scores) if fallback_efficiency_scores else 0.0
|
| 664 |
+
}
|
| 665 |
+
|
| 666 |
+
return results
|
| 667 |
+
```
|
| 668 |
+
|
| 669 |
+
---
|
| 670 |
+
|
| 671 |
+
## 📊 更新的系統成功標準
|
| 672 |
+
|
| 673 |
+
### 系統性能目標(七個指標)
|
| 674 |
+
```
|
| 675 |
+
✅ 達標條件:
|
| 676 |
+
1. 總處理時長 ≤ 30秒
|
| 677 |
+
2. 條件抽取成功率 ≥ 80%
|
| 678 |
+
3. 檢索相關性 ≥ 0.25(基於實際醫學數據)
|
| 679 |
+
4. 檢索覆蓋率 ≥ 60%
|
| 680 |
+
5. 臨床可操作性 ≥ 7.0/10
|
| 681 |
+
6. 臨床證據評分 ≥ 7.5/10
|
| 682 |
+
7. 早期攔截率 ≥ 70%(多層級 Fallback 效率)
|
| 683 |
+
|
| 684 |
+
🎯 YanBo RAG 系統成功標準:
|
| 685 |
+
- RAG增強版在 5-7 項指標上優於基線 Med42-70B
|
| 686 |
+
- 早期攔截率體現多層級設計的優勢
|
| 687 |
+
- 整體提升幅度 ≥ 15%
|
| 688 |
+
```
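
若要在程式中自動檢核上述達標條件,可用類似下面的示意函數;閾值取自本節,指標鍵名假設沿用 `run_complete_evaluation()` 的輸出格式(第 7 項以 fallback 效率分數近似早期攔截率,僅為示意):

```python
from typing import Dict, Tuple

# Thresholds copied from the criteria above; keys assume the
# run_complete_evaluation() metrics dict and are illustrative only.
TARGETS: Dict[str, Tuple[str, float]] = {
    "average_latency": ("<=", 30.0),
    "extraction_success_rate": (">=", 0.80),
    "average_relevance": (">=", 0.25),
    "average_coverage": (">=", 0.60),
    "average_actionability": (">=", 7.0),
    "average_evidence_score": (">=", 7.5),
    "average_fallback_efficiency": (">=", 0.70),  # approximate proxy for criterion 7
}


def check_targets(metrics: Dict[str, float]) -> Dict[str, bool]:
    """Return pass/fail for every metric that has a defined target."""
    report = {}
    for key, (op, threshold) in TARGETS.items():
        value = metrics.get(key)
        if value is None:
            continue
        report[key] = value <= threshold if op == "<=" else value >= threshold
    return report
```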
|
| 689 |
+
|
| 690 |
+
### YanBo 系統特有優勢分析
|
| 691 |
+
```
|
| 692 |
+
多層級 Fallback 優勢:
|
| 693 |
+
├── 漏接防護:通過多層級降低失敗率至 < 5%
|
| 694 |
+
├── 時間優化:70%+ 查詢在前兩層快速解決
|
| 695 |
+
├── 系統穩定:即使某層級失敗,後續層級提供保障
|
| 696 |
+
└── 智能分流:不同複雜度查詢自動分配到合適層級
|
| 697 |
+
```
|
| 698 |
+
|
| 699 |
+
---
|
| 700 |
+
|
| 701 |
+
**第七個指標已添加完成,專注測量您的多層級 Fallback 系統的早期攔截效率和時間節省效果。**
|
evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md}
RENAMED
|
File without changes
|
evaluation/old/extraction_evaluator.py
ADDED
|
@@ -0,0 +1,379 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OnCall.ai System - Condition Extraction Evaluator (Metric 2)
|
| 4 |
+
============================================================
|
| 5 |
+
|
| 6 |
+
Evaluates condition extraction success rate from user_prompt.py
|
| 7 |
+
Pure automatic evaluation based on extract_condition_keywords() results
|
| 8 |
+
|
| 9 |
+
Author: YanBo Chen
|
| 10 |
+
Date: 2025-08-04
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
from typing import Dict, List, Any
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
import re
|
| 20 |
+
|
| 21 |
+
# Add project path
|
| 22 |
+
current_dir = Path(__file__).parent
|
| 23 |
+
project_root = current_dir.parent
|
| 24 |
+
src_dir = project_root / "src"
|
| 25 |
+
sys.path.insert(0, str(src_dir))
|
| 26 |
+
|
| 27 |
+
# Import existing system components
|
| 28 |
+
try:
|
| 29 |
+
from user_prompt import UserPromptProcessor
|
| 30 |
+
from retrieval import BasicRetrievalSystem
|
| 31 |
+
from llm_clients import llm_Med42_70BClient
|
| 32 |
+
except ImportError as e:
|
| 33 |
+
print(f"❌ Import failed: {e}")
|
| 34 |
+
print("Please ensure running from project root directory")
|
| 35 |
+
sys.exit(1)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class ExtractionEvaluator:
|
| 39 |
+
"""Condition extraction success rate evaluator - pure automatic evaluation"""
|
| 40 |
+
|
| 41 |
+
def __init__(self):
|
| 42 |
+
"""Initialize system components for extraction testing"""
|
| 43 |
+
print("🔧 Initializing Extraction Evaluator...")
|
| 44 |
+
|
| 45 |
+
# Initialize required components for extraction
|
| 46 |
+
self.llm_client = llm_Med42_70BClient()
|
| 47 |
+
self.retrieval_system = BasicRetrievalSystem()
|
| 48 |
+
self.user_prompt_processor = UserPromptProcessor(
|
| 49 |
+
llm_client=self.llm_client,
|
| 50 |
+
retrieval_system=self.retrieval_system
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
# Results accumulation
|
| 54 |
+
self.extraction_results = []
|
| 55 |
+
|
| 56 |
+
print("✅ Extraction Evaluator initialization complete")
|
| 57 |
+
|
| 58 |
+
def evaluate_single_extraction(self, query: str, category: str = "unknown") -> Dict[str, Any]:
|
| 59 |
+
"""
|
| 60 |
+
Evaluate condition extraction success for a single query
|
| 61 |
+
|
| 62 |
+
Tests user_prompt.py extract_condition_keywords() method
|
| 63 |
+
|
| 64 |
+
Args:
|
| 65 |
+
query: Medical query to test
|
| 66 |
+
category: Query category (diagnosis/treatment/mixed)
|
| 67 |
+
"""
|
| 68 |
+
print(f"🔍 Testing extraction for: {query[:50]}...")
|
| 69 |
+
print(f"📋 Category: {category}")
|
| 70 |
+
|
| 71 |
+
try:
|
| 72 |
+
# Call the actual extraction method from user_prompt.py
|
| 73 |
+
extraction_start = datetime.now()
|
| 74 |
+
condition_result = self.user_prompt_processor.extract_condition_keywords(query)
|
| 75 |
+
extraction_time = (datetime.now() - extraction_start).total_seconds()
|
| 76 |
+
|
| 77 |
+
# Analyze extraction success
|
| 78 |
+
extracted_condition = condition_result.get('condition')
|
| 79 |
+
query_status = condition_result.get('query_status')
|
| 80 |
+
emergency_keywords = condition_result.get('emergency_keywords', [])
|
| 81 |
+
treatment_keywords = condition_result.get('treatment_keywords', [])
|
| 82 |
+
fallback_level = condition_result.get('fallback_level', 'unknown')
|
| 83 |
+
|
| 84 |
+
# Define success criteria
|
| 85 |
+
is_successful = (
|
| 86 |
+
extracted_condition and
|
| 87 |
+
extracted_condition.strip() and
|
| 88 |
+
extracted_condition != "unknown" and
|
| 89 |
+
query_status not in ['invalid_query', 'non_medical']
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
result = {
|
| 93 |
+
"query": query,
|
| 94 |
+
"category": category,
|
| 95 |
+
"extraction_success": is_successful,
|
| 96 |
+
"extraction_time": extraction_time,
|
| 97 |
+
"extracted_condition": extracted_condition,
|
| 98 |
+
"query_status": query_status,
|
| 99 |
+
"emergency_keywords": emergency_keywords,
|
| 100 |
+
"treatment_keywords": treatment_keywords,
|
| 101 |
+
"fallback_level": fallback_level,
|
| 102 |
+
"full_condition_result": condition_result,
|
| 103 |
+
"timestamp": datetime.now().isoformat()
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
# Store result
|
| 107 |
+
self.extraction_results.append(result)
|
| 108 |
+
|
| 109 |
+
print(f" ✅ Extraction: {'Success' if is_successful else 'Failed'}")
|
| 110 |
+
print(f" 📝 Condition: {extracted_condition}")
|
| 111 |
+
print(f" 🎯 Status: {query_status}")
|
| 112 |
+
print(f" ⏱️ Time: {extraction_time:.3f}s")
|
| 113 |
+
print(f" 🔄 Fallback Level: {fallback_level}")
|
| 114 |
+
|
| 115 |
+
return result
|
| 116 |
+
|
| 117 |
+
except Exception as e:
|
| 118 |
+
error_result = {
|
| 119 |
+
"query": query,
|
| 120 |
+
"category": category,
|
| 121 |
+
"extraction_success": False,
|
| 122 |
+
"extraction_time": 0.0,
|
| 123 |
+
"error": str(e),
|
| 124 |
+
"timestamp": datetime.now().isoformat()
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
self.extraction_results.append(error_result)
|
| 128 |
+
print(f" ❌ Extraction failed: {e}")
|
| 129 |
+
|
| 130 |
+
return error_result
|
| 131 |
+
|
| 132 |
+
def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
|
| 133 |
+
"""Parse queries from file with category labels"""
|
| 134 |
+
print(f"📁 Reading queries from file: {filepath}")
|
| 135 |
+
|
| 136 |
+
try:
|
| 137 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 138 |
+
content = f.read()
|
| 139 |
+
|
| 140 |
+
# Parse queries with category labels
|
| 141 |
+
queries_by_category = {
|
| 142 |
+
"diagnosis": [],
|
| 143 |
+
"treatment": [],
|
| 144 |
+
"mixed": []
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
lines = content.strip().split('\n')
|
| 148 |
+
|
| 149 |
+
for line in lines:
|
| 150 |
+
line = line.strip()
|
| 151 |
+
if not line:
|
| 152 |
+
continue
|
| 153 |
+
|
| 154 |
+
# Parse format: "1.diagnosis: query text"
|
| 155 |
+
match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
|
| 156 |
+
if match:
|
| 157 |
+
category_raw = match.group(1).lower()
|
| 158 |
+
query_text = match.group(2).strip()
|
| 159 |
+
|
| 160 |
+
# Normalize category name
|
| 161 |
+
if category_raw in ['mixed/complicated', 'mixed']:
|
| 162 |
+
category = 'mixed'
|
| 163 |
+
else:
|
| 164 |
+
category = category_raw
|
| 165 |
+
|
| 166 |
+
if category in queries_by_category and len(query_text) > 15:
|
| 167 |
+
queries_by_category[category].append({
|
| 168 |
+
"text": query_text,
|
| 169 |
+
"category": category
|
| 170 |
+
})
|
| 171 |
+
|
| 172 |
+
print(f"📋 Parsed queries by category:")
|
| 173 |
+
for category, category_queries in queries_by_category.items():
|
| 174 |
+
print(f" {category.capitalize()}: {len(category_queries)} queries")
|
| 175 |
+
|
| 176 |
+
return queries_by_category
|
| 177 |
+
|
| 178 |
+
except Exception as e:
|
| 179 |
+
print(f"❌ Failed to read file: {e}")
|
| 180 |
+
return {"error": f"Failed to read file: {e}"}
|
| 181 |
+
|
| 182 |
+
def calculate_extraction_statistics(self) -> Dict[str, Any]:
|
| 183 |
+
"""Calculate extraction success statistics by category"""
|
| 184 |
+
category_stats = {}
|
| 185 |
+
all_results = []
|
| 186 |
+
|
| 187 |
+
# Group results by category
|
| 188 |
+
results_by_category = {
|
| 189 |
+
"diagnosis": [],
|
| 190 |
+
"treatment": [],
|
| 191 |
+
"mixed": []
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
for result in self.extraction_results:
|
| 195 |
+
category = result.get('category', 'unknown')
|
| 196 |
+
if category in results_by_category:
|
| 197 |
+
results_by_category[category].append(result)
|
| 198 |
+
all_results.append(result)
|
| 199 |
+
|
| 200 |
+
# Calculate statistics for each category
|
| 201 |
+
for category, results in results_by_category.items():
|
| 202 |
+
if results:
|
| 203 |
+
successful = [r for r in results if r.get('extraction_success')]
|
| 204 |
+
success_rate = len(successful) / len(results)
|
| 205 |
+
avg_time = sum(r.get('extraction_time', 0) for r in results) / len(results)
|
| 206 |
+
|
| 207 |
+
category_stats[category] = {
|
| 208 |
+
"success_rate": success_rate,
|
| 209 |
+
"successful_count": len(successful),
|
| 210 |
+
"total_count": len(results),
|
| 211 |
+
"average_extraction_time": avg_time,
|
| 212 |
+
"fallback_levels": [r.get('fallback_level') for r in results]
|
| 213 |
+
}
|
| 214 |
+
else:
|
| 215 |
+
category_stats[category] = {
|
| 216 |
+
"success_rate": 0.0,
|
| 217 |
+
"successful_count": 0,
|
| 218 |
+
"total_count": 0,
|
| 219 |
+
"average_extraction_time": 0.0,
|
| 220 |
+
"fallback_levels": []
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
# Calculate overall statistics
|
| 224 |
+
if all_results:
|
| 225 |
+
overall_successful = [r for r in all_results if r.get('extraction_success')]
|
| 226 |
+
overall_stats = {
|
| 227 |
+
"success_rate": len(overall_successful) / len(all_results),
|
| 228 |
+
"successful_count": len(overall_successful),
|
| 229 |
+
"total_count": len(all_results),
|
| 230 |
+
"average_extraction_time": sum(r.get('extraction_time', 0) for r in all_results) / len(all_results),
|
| 231 |
+
"target_compliance": len(overall_successful) / len(all_results) >= 0.8
|
| 232 |
+
}
|
| 233 |
+
else:
|
| 234 |
+
overall_stats = {
|
| 235 |
+
"success_rate": 0.0,
|
| 236 |
+
"successful_count": 0,
|
| 237 |
+
"total_count": 0,
|
| 238 |
+
"average_extraction_time": 0.0,
|
| 239 |
+
"target_compliance": False
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
return {
|
| 243 |
+
"category_results": category_stats,
|
| 244 |
+
"overall_results": overall_stats,
|
| 245 |
+
"timestamp": datetime.now().isoformat()
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
def save_extraction_statistics(self, filename: str = None) -> str:
|
| 249 |
+
"""Save extraction statistics for chart generation"""
|
| 250 |
+
stats = self.calculate_extraction_statistics()
|
| 251 |
+
|
| 252 |
+
if filename is None:
|
| 253 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 254 |
+
filename = f"extraction_statistics_{timestamp}.json"
|
| 255 |
+
|
| 256 |
+
# Ensure results directory exists
|
| 257 |
+
results_dir = Path(__file__).parent / "results"
|
| 258 |
+
results_dir.mkdir(exist_ok=True)
|
| 259 |
+
|
| 260 |
+
filepath = results_dir / filename
|
| 261 |
+
|
| 262 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 263 |
+
json.dump(stats, f, indent=2, ensure_ascii=False)
|
| 264 |
+
|
| 265 |
+
print(f"📊 Extraction statistics saved to: {filepath}")
|
| 266 |
+
return str(filepath)
|
| 267 |
+
|
| 268 |
+
def save_extraction_details(self, filename: str = None) -> str:
|
| 269 |
+
"""Save detailed extraction results"""
|
| 270 |
+
if filename is None:
|
| 271 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 272 |
+
filename = f"extraction_details_{timestamp}.json"
|
| 273 |
+
|
| 274 |
+
# Ensure results directory exists
|
| 275 |
+
results_dir = Path(__file__).parent / "results"
|
| 276 |
+
results_dir.mkdir(exist_ok=True)
|
| 277 |
+
|
| 278 |
+
filepath = results_dir / filename
|
| 279 |
+
|
| 280 |
+
# Create comprehensive extraction data
|
| 281 |
+
extraction_data = {
|
| 282 |
+
"evaluation_metadata": {
|
| 283 |
+
"total_queries": len(self.extraction_results),
|
| 284 |
+
"timestamp": datetime.now().isoformat(),
|
| 285 |
+
"evaluator_type": "condition_extraction"
|
| 286 |
+
},
|
| 287 |
+
"extraction_results": self.extraction_results
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 291 |
+
json.dump(extraction_data, f, indent=2, ensure_ascii=False)
|
| 292 |
+
|
| 293 |
+
print(f"📝 Extraction details saved to: {filepath}")
|
| 294 |
+
return str(filepath)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
# Independent execution interface
|
| 298 |
+
if __name__ == "__main__":
|
| 299 |
+
"""Independent extraction evaluation interface"""
|
| 300 |
+
|
| 301 |
+
print("🔍 OnCall.ai Extraction Evaluator - Condition Extraction Success Rate")
|
| 302 |
+
|
| 303 |
+
if len(sys.argv) > 1:
|
| 304 |
+
query_file = sys.argv[1]
|
| 305 |
+
else:
|
| 306 |
+
# Default to evaluation/pre_user_query_evaluate.txt
|
| 307 |
+
query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
|
| 308 |
+
|
| 309 |
+
if not os.path.exists(query_file):
|
| 310 |
+
print(f"❌ Query file not found: {query_file}")
|
| 311 |
+
print("Usage: python extraction_evaluator.py [query_file.txt]")
|
| 312 |
+
sys.exit(1)
|
| 313 |
+
|
| 314 |
+
# Initialize evaluator
|
| 315 |
+
evaluator = ExtractionEvaluator()
|
| 316 |
+
|
| 317 |
+
# Parse queries from file
|
| 318 |
+
queries_by_category = evaluator.parse_queries_from_file(str(query_file))
|
| 319 |
+
|
| 320 |
+
if "error" in queries_by_category:
|
| 321 |
+
print(f"❌ Failed to parse queries: {queries_by_category['error']}")
|
| 322 |
+
sys.exit(1)
|
| 323 |
+
|
| 324 |
+
# Test extraction for each query
|
| 325 |
+
print(f"\n🧪 Condition Extraction Testing")
|
| 326 |
+
|
| 327 |
+
for category, queries in queries_by_category.items():
|
| 328 |
+
if not queries:
|
| 329 |
+
continue
|
| 330 |
+
|
| 331 |
+
print(f"\n📂 Testing {category.upper()} extraction:")
|
| 332 |
+
|
| 333 |
+
for i, query_info in enumerate(queries):
|
| 334 |
+
query_text = query_info['text']
|
| 335 |
+
|
| 336 |
+
# Test extraction
|
| 337 |
+
result = evaluator.evaluate_single_extraction(query_text, category)
|
| 338 |
+
|
| 339 |
+
# Pause between queries to avoid rate limits (if needed)
|
| 340 |
+
if i < len(queries) - 1:
|
| 341 |
+
print(f" ⏳ Pausing 3s before next query...")
|
| 342 |
+
import time
|
| 343 |
+
time.sleep(3)
|
| 344 |
+
|
| 345 |
+
# Pause between categories
|
| 346 |
+
if category != list(queries_by_category.keys())[-1]:
|
| 347 |
+
print(f"\n⏳ Pausing 5s before next category...")
|
| 348 |
+
import time
|
| 349 |
+
time.sleep(5)
|
| 350 |
+
|
| 351 |
+
# Generate and save results
|
| 352 |
+
print(f"\n📊 Generating extraction analysis...")
|
| 353 |
+
|
| 354 |
+
# Save statistics and details
|
| 355 |
+
stats_path = evaluator.save_extraction_statistics()
|
| 356 |
+
details_path = evaluator.save_extraction_details()
|
| 357 |
+
|
| 358 |
+
# Print final summary
|
| 359 |
+
stats = evaluator.calculate_extraction_statistics()
|
| 360 |
+
category_results = stats['category_results']
|
| 361 |
+
overall_results = stats['overall_results']
|
| 362 |
+
|
| 363 |
+
print(f"\n📊 === EXTRACTION EVALUATION SUMMARY ===")
|
| 364 |
+
print(f"Overall Performance:")
|
| 365 |
+
print(f" Success Rate: {overall_results['success_rate']:.1%}")
|
| 366 |
+
print(f" Successful Extractions: {overall_results['successful_count']}/{overall_results['total_count']}")
|
| 367 |
+
print(f" Average Extraction Time: {overall_results['average_extraction_time']:.3f}s")
|
| 368 |
+
print(f" 80% Target Compliance: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")
|
| 369 |
+
|
| 370 |
+
print(f"\nCategory Breakdown:")
|
| 371 |
+
for category, cat_stats in category_results.items():
|
| 372 |
+
if cat_stats['total_count'] > 0:
|
| 373 |
+
print(f" {category.capitalize()}: {cat_stats['success_rate']:.1%} "
|
| 374 |
+
f"({cat_stats['successful_count']}/{cat_stats['total_count']}) "
|
| 375 |
+
f"[{cat_stats['average_extraction_time']:.3f}s avg]")
|
| 376 |
+
|
| 377 |
+
print(f"\n✅ Extraction evaluation complete!")
|
| 378 |
+
print(f"📊 Statistics: {stats_path}")
|
| 379 |
+
print(f"📝 Details: {details_path}")
|
evaluation/old/relevance_evaluator.py
ADDED
|
@@ -0,0 +1,447 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OnCall.ai System - Retrieval Relevance Evaluator (Metric 3)
|
| 4 |
+
===========================================================
|
| 5 |
+
|
| 6 |
+
Evaluates retrieval relevance using cosine similarity from retrieval.py
|
| 7 |
+
Automatic evaluation based on existing similarity scores with optional LLM sampling
|
| 8 |
+
|
| 9 |
+
Author: YanBo Chen
|
| 10 |
+
Date: 2025-08-04
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
from typing import Dict, List, Any
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
import re
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
# Add project path
|
| 23 |
+
current_dir = Path(__file__).parent
|
| 24 |
+
project_root = current_dir.parent
|
| 25 |
+
src_dir = project_root / "src"
|
| 26 |
+
sys.path.insert(0, str(src_dir))
|
| 27 |
+
|
| 28 |
+
# Import existing system components
|
| 29 |
+
try:
|
| 30 |
+
from user_prompt import UserPromptProcessor
|
| 31 |
+
from retrieval import BasicRetrievalSystem
|
| 32 |
+
from llm_clients import llm_Med42_70BClient
|
| 33 |
+
except ImportError as e:
|
| 34 |
+
print(f"❌ Import failed: {e}")
|
| 35 |
+
print("Please ensure running from project root directory")
|
| 36 |
+
sys.exit(1)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class RelevanceEvaluator:
|
| 40 |
+
"""Retrieval relevance evaluator using cosine similarity - automatic evaluation"""
|
| 41 |
+
|
| 42 |
+
def __init__(self):
|
| 43 |
+
"""Initialize system components for relevance testing"""
|
| 44 |
+
print("🔧 Initializing Relevance Evaluator...")
|
| 45 |
+
|
| 46 |
+
# Initialize required components
|
| 47 |
+
self.llm_client = llm_Med42_70BClient()
|
| 48 |
+
self.retrieval_system = BasicRetrievalSystem()
|
| 49 |
+
self.user_prompt_processor = UserPromptProcessor(
|
| 50 |
+
llm_client=self.llm_client,
|
| 51 |
+
retrieval_system=self.retrieval_system
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
# Results accumulation
|
| 55 |
+
self.relevance_results = []
|
| 56 |
+
|
| 57 |
+
print("✅ Relevance Evaluator initialization complete")
|
| 58 |
+
|
| 59 |
+
def evaluate_single_relevance(self, query: str, category: str = "unknown") -> Dict[str, Any]:
|
| 60 |
+
"""
|
| 61 |
+
Evaluate retrieval relevance for a single query
|
| 62 |
+
|
| 63 |
+
Uses existing cosine similarity scores from retrieval.py
|
| 64 |
+
|
| 65 |
+
Args:
|
| 66 |
+
query: Medical query to test
|
| 67 |
+
category: Query category (diagnosis/treatment/mixed)
|
| 68 |
+
"""
|
| 69 |
+
print(f"🔍 Testing relevance for: {query[:50]}...")
|
| 70 |
+
print(f"📋 Category: {category}")
|
| 71 |
+
|
| 72 |
+
try:
|
| 73 |
+
# Step 1: Extract condition for search query construction
|
| 74 |
+
condition_result = self.user_prompt_processor.extract_condition_keywords(query)
|
| 75 |
+
|
| 76 |
+
# Step 2: Perform retrieval (same as latency_evaluator.py)
|
| 77 |
+
search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
|
| 78 |
+
if not search_query:
|
| 79 |
+
search_query = condition_result.get('condition', query)
|
| 80 |
+
|
| 81 |
+
retrieval_start = datetime.now()
|
| 82 |
+
retrieval_results = self.retrieval_system.search(search_query, top_k=5)
|
| 83 |
+
retrieval_time = (datetime.now() - retrieval_start).total_seconds()
|
| 84 |
+
|
| 85 |
+
# Step 3: Extract similarity scores from retrieval results
|
| 86 |
+
processed_results = retrieval_results.get('processed_results', [])
|
| 87 |
+
|
| 88 |
+
if not processed_results:
|
| 89 |
+
result = {
|
| 90 |
+
"query": query,
|
| 91 |
+
"category": category,
|
| 92 |
+
"search_query": search_query,
|
| 93 |
+
"retrieval_success": False,
|
| 94 |
+
"average_relevance": 0.0,
|
| 95 |
+
"relevance_scores": [],
|
| 96 |
+
"retrieved_count": 0,
|
| 97 |
+
"retrieval_time": retrieval_time,
|
| 98 |
+
"error": "No retrieval results",
|
| 99 |
+
"timestamp": datetime.now().isoformat()
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
self.relevance_results.append(result)
|
| 103 |
+
print(f" ❌ No retrieval results found")
|
| 104 |
+
return result
|
| 105 |
+
|
| 106 |
+
# Extract cosine similarity scores
|
| 107 |
+
similarity_scores = []
|
| 108 |
+
retrieval_details = []
|
| 109 |
+
|
| 110 |
+
for i, doc_result in enumerate(processed_results):
|
| 111 |
+
# Get similarity score (may be stored as 'distance', 'similarity_score', or 'score')
|
| 112 |
+
similarity = (
|
| 113 |
+
doc_result.get('distance', 0.0) or
|
| 114 |
+
doc_result.get('similarity_score', 0.0) or
|
| 115 |
+
doc_result.get('score', 0.0)
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
similarity_scores.append(similarity)
|
| 119 |
+
|
| 120 |
+
retrieval_details.append({
|
| 121 |
+
"doc_index": i,
|
| 122 |
+
"similarity_score": similarity,
|
| 123 |
+
"content_snippet": doc_result.get('content', '')[:100] + "...",
|
| 124 |
+
"doc_type": doc_result.get('type', 'unknown'),
|
| 125 |
+
"source": doc_result.get('source', 'unknown')
|
| 126 |
+
})
|
| 127 |
+
|
| 128 |
+
# Calculate relevance metrics
|
| 129 |
+
average_relevance = sum(similarity_scores) / len(similarity_scores)
|
| 130 |
+
max_relevance = max(similarity_scores)
|
| 131 |
+
min_relevance = min(similarity_scores)
|
| 132 |
+
|
| 133 |
+
# Count high-relevance results (threshold: 0.2 based on evaluation_instruction.md)
|
| 134 |
+
high_relevance_count = sum(1 for score in similarity_scores if score >= 0.2)
|
| 135 |
+
high_relevance_ratio = high_relevance_count / len(similarity_scores)
|
| 136 |
+
|
| 137 |
+
result = {
|
| 138 |
+
"query": query,
|
| 139 |
+
"category": category,
|
| 140 |
+
"search_query": search_query,
|
| 141 |
+
"retrieval_success": True,
|
| 142 |
+
"average_relevance": average_relevance,
|
| 143 |
+
"max_relevance": max_relevance,
|
| 144 |
+
"min_relevance": min_relevance,
|
| 145 |
+
"relevance_scores": similarity_scores,
|
| 146 |
+
"high_relevance_count": high_relevance_count,
|
| 147 |
+
"high_relevance_ratio": high_relevance_ratio,
|
| 148 |
+
"retrieved_count": len(processed_results),
|
| 149 |
+
"retrieval_time": retrieval_time,
|
| 150 |
+
"retrieval_details": retrieval_details,
|
| 151 |
+
"meets_threshold": average_relevance >= 0.2,
|
| 152 |
+
"timestamp": datetime.now().isoformat()
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
# Store result
|
| 156 |
+
self.relevance_results.append(result)
|
| 157 |
+
|
| 158 |
+
print(f" ✅ Retrieval: {len(processed_results)} documents")
|
| 159 |
+
print(f" 📊 Average Relevance: {average_relevance:.3f}")
|
| 160 |
+
print(f" 📈 High Relevance (≥0.2): {high_relevance_count}/{len(processed_results)} ({high_relevance_ratio:.1%})")
|
| 161 |
+
print(f" 🎯 Threshold: {'✅ Met' if result['meets_threshold'] else '❌ Not Met'}")
|
| 162 |
+
print(f" ⏱️ Retrieval Time: {retrieval_time:.3f}s")
|
| 163 |
+
|
| 164 |
+
return result
|
| 165 |
+
|
| 166 |
+
except Exception as e:
|
| 167 |
+
error_result = {
|
| 168 |
+
"query": query,
|
| 169 |
+
"category": category,
|
| 170 |
+
"retrieval_success": False,
|
| 171 |
+
"average_relevance": 0.0,
|
| 172 |
+
"error": str(e),
|
| 173 |
+
"timestamp": datetime.now().isoformat()
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
self.relevance_results.append(error_result)
|
| 177 |
+
print(f" ❌ Relevance evaluation failed: {e}")
|
| 178 |
+
|
| 179 |
+
return error_result
|
| 180 |
+
|
| 181 |
+
def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
|
| 182 |
+
"""Parse queries from file with category labels"""
|
| 183 |
+
print(f"📁 Reading queries from file: {filepath}")
|
| 184 |
+
|
| 185 |
+
try:
|
| 186 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 187 |
+
content = f.read()
|
| 188 |
+
|
| 189 |
+
# Parse queries with category labels
|
| 190 |
+
queries_by_category = {
|
| 191 |
+
"diagnosis": [],
|
| 192 |
+
"treatment": [],
|
| 193 |
+
"mixed": []
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
lines = content.strip().split('\n')
|
| 197 |
+
|
| 198 |
+
for line in lines:
|
| 199 |
+
line = line.strip()
|
| 200 |
+
if not line:
|
| 201 |
+
continue
|
| 202 |
+
|
| 203 |
+
# Parse format: "1.diagnosis: query text"
|
| 204 |
+
match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
|
| 205 |
+
if match:
|
| 206 |
+
category_raw = match.group(1).lower()
|
| 207 |
+
query_text = match.group(2).strip()
|
| 208 |
+
|
| 209 |
+
# Normalize category name
|
| 210 |
+
if category_raw in ['mixed/complicated', 'mixed']:
|
| 211 |
+
category = 'mixed'
|
| 212 |
+
else:
|
| 213 |
+
category = category_raw
|
| 214 |
+
|
| 215 |
+
if category in queries_by_category and len(query_text) > 15:
|
| 216 |
+
queries_by_category[category].append({
|
| 217 |
+
"text": query_text,
|
| 218 |
+
"category": category
|
| 219 |
+
})
|
| 220 |
+
|
| 221 |
+
print(f"📋 Parsed queries by category:")
|
| 222 |
+
for category, category_queries in queries_by_category.items():
|
| 223 |
+
print(f" {category.capitalize()}: {len(category_queries)} queries")
|
| 224 |
+
|
| 225 |
+
return queries_by_category
|
| 226 |
+
|
| 227 |
+
except Exception as e:
|
| 228 |
+
print(f"❌ Failed to read file: {e}")
|
| 229 |
+
return {"error": f"Failed to read file: {e}"}
|
| 230 |
+
|
| 231 |
+
def calculate_relevance_statistics(self) -> Dict[str, Any]:
|
| 232 |
+
"""Calculate relevance statistics by category"""
|
| 233 |
+
category_stats = {}
|
| 234 |
+
all_successful_results = []
|
| 235 |
+
|
| 236 |
+
# Group results by category
|
| 237 |
+
results_by_category = {
|
| 238 |
+
"diagnosis": [],
|
| 239 |
+
"treatment": [],
|
| 240 |
+
"mixed": []
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
for result in self.relevance_results:
|
| 244 |
+
category = result.get('category', 'unknown')
|
| 245 |
+
if category in results_by_category:
|
| 246 |
+
results_by_category[category].append(result)
|
| 247 |
+
if result.get('retrieval_success'):
|
| 248 |
+
all_successful_results.append(result)
|
| 249 |
+
|
| 250 |
+
# Calculate statistics for each category
|
| 251 |
+
for category, results in results_by_category.items():
|
| 252 |
+
successful_results = [r for r in results if r.get('retrieval_success')]
|
| 253 |
+
|
| 254 |
+
if successful_results:
|
| 255 |
+
avg_relevance = sum(r['average_relevance'] for r in successful_results) / len(successful_results)
|
| 256 |
+
relevance_scores = [r['average_relevance'] for r in successful_results]
|
| 257 |
+
avg_retrieval_time = sum(r.get('retrieval_time', 0) for r in successful_results) / len(successful_results)
|
| 258 |
+
|
| 259 |
+
category_stats[category] = {
|
| 260 |
+
"average_relevance": avg_relevance,
|
| 261 |
+
"max_relevance": max(relevance_scores),
|
| 262 |
+
"min_relevance": min(relevance_scores),
|
| 263 |
+
"successful_retrievals": len(successful_results),
|
| 264 |
+
"total_queries": len(results),
|
| 265 |
+
"success_rate": len(successful_results) / len(results),
|
| 266 |
+
"average_retrieval_time": avg_retrieval_time,
|
| 267 |
+
"meets_threshold": avg_relevance >= 0.2,
|
| 268 |
+
"individual_relevance_scores": relevance_scores
|
| 269 |
+
}
|
| 270 |
+
else:
|
| 271 |
+
category_stats[category] = {
|
| 272 |
+
"average_relevance": 0.0,
|
| 273 |
+
"max_relevance": 0.0,
|
| 274 |
+
"min_relevance": 0.0,
|
| 275 |
+
"successful_retrievals": 0,
|
| 276 |
+
"total_queries": len(results),
|
| 277 |
+
"success_rate": 0.0,
|
| 278 |
+
"average_retrieval_time": 0.0,
|
| 279 |
+
"meets_threshold": False,
|
| 280 |
+
"individual_relevance_scores": []
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
# Calculate overall statistics
|
| 284 |
+
if all_successful_results:
|
| 285 |
+
all_relevance_scores = [r['average_relevance'] for r in all_successful_results]
|
| 286 |
+
overall_stats = {
|
| 287 |
+
"average_relevance": sum(all_relevance_scores) / len(all_relevance_scores),
|
| 288 |
+
"max_relevance": max(all_relevance_scores),
|
| 289 |
+
"min_relevance": min(all_relevance_scores),
|
| 290 |
+
"successful_retrievals": len(all_successful_results),
|
| 291 |
+
"total_queries": len(self.relevance_results),
|
| 292 |
+
"success_rate": len(all_successful_results) / len(self.relevance_results),
|
| 293 |
+
"meets_threshold": (sum(all_relevance_scores) / len(all_relevance_scores)) >= 0.2,
|
| 294 |
+
"target_compliance": (sum(all_relevance_scores) / len(all_relevance_scores)) >= 0.25
|
| 295 |
+
}
|
| 296 |
+
else:
|
| 297 |
+
overall_stats = {
|
| 298 |
+
"average_relevance": 0.0,
|
| 299 |
+
"max_relevance": 0.0,
|
| 300 |
+
"min_relevance": 0.0,
|
| 301 |
+
"successful_retrievals": 0,
|
| 302 |
+
"total_queries": len(self.relevance_results),
|
| 303 |
+
"success_rate": 0.0,
|
| 304 |
+
"meets_threshold": False,
|
| 305 |
+
"target_compliance": False
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
return {
|
| 309 |
+
"category_results": category_stats,
|
| 310 |
+
"overall_results": overall_stats,
|
| 311 |
+
"timestamp": datetime.now().isoformat()
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
def save_relevance_statistics(self, filename: str = None) -> str:
|
| 315 |
+
"""Save relevance statistics for chart generation"""
|
| 316 |
+
stats = self.calculate_relevance_statistics()
|
| 317 |
+
|
| 318 |
+
if filename is None:
|
| 319 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 320 |
+
filename = f"relevance_statistics_{timestamp}.json"
|
| 321 |
+
|
| 322 |
+
# Ensure results directory exists
|
| 323 |
+
results_dir = Path(__file__).parent / "results"
|
| 324 |
+
results_dir.mkdir(exist_ok=True)
|
| 325 |
+
|
| 326 |
+
filepath = results_dir / filename
|
| 327 |
+
|
| 328 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 329 |
+
json.dump(stats, f, indent=2, ensure_ascii=False)
|
| 330 |
+
|
| 331 |
+
print(f"📊 Relevance statistics saved to: {filepath}")
|
| 332 |
+
return str(filepath)
|
| 333 |
+
|
| 334 |
+
def save_relevance_details(self, filename: str = None) -> str:
|
| 335 |
+
"""Save detailed relevance results"""
|
| 336 |
+
if filename is None:
|
| 337 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 338 |
+
filename = f"relevance_details_{timestamp}.json"
|
| 339 |
+
|
| 340 |
+
# Ensure results directory exists
|
| 341 |
+
results_dir = Path(__file__).parent / "results"
|
| 342 |
+
results_dir.mkdir(exist_ok=True)
|
| 343 |
+
|
| 344 |
+
filepath = results_dir / filename
|
| 345 |
+
|
| 346 |
+
# Create comprehensive relevance data
|
| 347 |
+
relevance_data = {
|
| 348 |
+
"evaluation_metadata": {
|
| 349 |
+
"total_queries": len(self.relevance_results),
|
| 350 |
+
"successful_retrievals": len([r for r in self.relevance_results if r.get('retrieval_success')]),
|
| 351 |
+
"timestamp": datetime.now().isoformat(),
|
| 352 |
+
"evaluator_type": "retrieval_relevance",
|
| 353 |
+
"threshold_used": 0.2
|
| 354 |
+
},
|
| 355 |
+
"relevance_results": self.relevance_results
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 359 |
+
json.dump(relevance_data, f, indent=2, ensure_ascii=False)
|
| 360 |
+
|
| 361 |
+
print(f"📝 Relevance details saved to: {filepath}")
|
| 362 |
+
return str(filepath)
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
# Independent execution interface
|
| 366 |
+
if __name__ == "__main__":
|
| 367 |
+
"""Independent relevance evaluation interface"""
|
| 368 |
+
|
| 369 |
+
print("📊 OnCall.ai Relevance Evaluator - Retrieval Relevance Analysis")
|
| 370 |
+
|
| 371 |
+
if len(sys.argv) > 1:
|
| 372 |
+
query_file = sys.argv[1]
|
| 373 |
+
else:
|
| 374 |
+
# Default to evaluation/pre_user_query_evaluate.txt
|
| 375 |
+
query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
|
| 376 |
+
|
| 377 |
+
if not os.path.exists(query_file):
|
| 378 |
+
print(f"❌ Query file not found: {query_file}")
|
| 379 |
+
print("Usage: python relevance_evaluator.py [query_file.txt]")
|
| 380 |
+
sys.exit(1)
|
| 381 |
+
|
| 382 |
+
# Initialize evaluator
|
| 383 |
+
evaluator = RelevanceEvaluator()
|
| 384 |
+
|
| 385 |
+
# Parse queries from file
|
| 386 |
+
queries_by_category = evaluator.parse_queries_from_file(str(query_file))
|
| 387 |
+
|
| 388 |
+
if "error" in queries_by_category:
|
| 389 |
+
print(f"❌ Failed to parse queries: {queries_by_category['error']}")
|
| 390 |
+
sys.exit(1)
|
| 391 |
+
|
| 392 |
+
# Test relevance for each query
|
| 393 |
+
print(f"\n🧪 Retrieval Relevance Testing")
|
| 394 |
+
|
| 395 |
+
for category, queries in queries_by_category.items():
|
| 396 |
+
if not queries:
|
| 397 |
+
continue
|
| 398 |
+
|
| 399 |
+
print(f"\n📂 Testing {category.upper()} relevance:")
|
| 400 |
+
|
| 401 |
+
for i, query_info in enumerate(queries):
|
| 402 |
+
query_text = query_info['text']
|
| 403 |
+
|
| 404 |
+
# Test relevance
|
| 405 |
+
result = evaluator.evaluate_single_relevance(query_text, category)
|
| 406 |
+
|
| 407 |
+
# Pause between queries to avoid rate limits
|
| 408 |
+
if i < len(queries) - 1:
|
| 409 |
+
print(f" ⏳ Pausing 3s before next query...")
|
| 410 |
+
import time
|
| 411 |
+
time.sleep(3)
|
| 412 |
+
|
| 413 |
+
# Pause between categories
|
| 414 |
+
if category != list(queries_by_category.keys())[-1]:
|
| 415 |
+
print(f"\n⏳ Pausing 5s before next category...")
|
| 416 |
+
import time
|
| 417 |
+
time.sleep(5)
|
| 418 |
+
|
| 419 |
+
# Generate and save results
|
| 420 |
+
print(f"\n📊 Generating relevance analysis...")
|
| 421 |
+
|
| 422 |
+
# Save statistics and details
|
| 423 |
+
stats_path = evaluator.save_relevance_statistics()
|
| 424 |
+
details_path = evaluator.save_relevance_details()
|
| 425 |
+
|
| 426 |
+
# Print final summary
|
| 427 |
+
stats = evaluator.calculate_relevance_statistics()
|
| 428 |
+
category_results = stats['category_results']
|
| 429 |
+
overall_results = stats['overall_results']
|
| 430 |
+
|
| 431 |
+
print(f"\n📊 === RELEVANCE EVALUATION SUMMARY ===")
|
| 432 |
+
print(f"Overall Performance:")
|
| 433 |
+
print(f" Average Relevance: {overall_results['average_relevance']:.3f}")
|
| 434 |
+
print(f" Retrieval Success Rate: {overall_results['success_rate']:.1%}")
|
| 435 |
+
print(f" 0.2 Threshold: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
|
| 436 |
+
print(f" 0.25 Target: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")
|
| 437 |
+
|
| 438 |
+
print(f"\nCategory Breakdown:")
|
| 439 |
+
for category, cat_stats in category_results.items():
|
| 440 |
+
if cat_stats['total_queries'] > 0:
|
| 441 |
+
print(f" {category.capitalize()}: {cat_stats['average_relevance']:.3f} "
|
| 442 |
+
f"({cat_stats['successful_retrievals']}/{cat_stats['total_queries']}) "
|
| 443 |
+
f"[{cat_stats['average_retrieval_time']:.3f}s avg]")
|
| 444 |
+
|
| 445 |
+
print(f"\n✅ Relevance evaluation complete!")
|
| 446 |
+
print(f"📊 Statistics: {stats_path}")
|
| 447 |
+
print(f"📝 Details: {details_path}")
|
evaluation/pre_user_query_evaluate.txt
ADDED
|
@@ -0,0 +1,5 @@
|
| 1 |
+
1.diagnosis: 60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?
|
| 2 |
+
|
| 3 |
+
2.treatment: Suspected acute ischemic stroke. Tell me the next steps to take
|
| 4 |
+
|
| 5 |
+
3.mixed/complicated: 20 y/f , porphyria, sudden seizure. What are possible causes and complete management workflow?
|
evaluation/single_test_query.txt
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
1.diagnosis: 60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?
|
evaluation/user_query.txt
CHANGED
|
@@ -17,18 +17,18 @@ Suspected acute ischemic stroke. Tell me the next steps to take
|
|
| 17 |
|
| 18 |
### 一、Diagnosis-Focused(診斷為主)
|
| 19 |
|
| 20 |
-
1. I have a 68-year-old man with atrial fibrillation presenting with sudden slurred speech and right-sided weakness
|
| 21 |
-
2. A 40-year-old woman reports fever, urinary frequency, and dysuria
|
| 22 |
-
3. A 50-year-old patient has progressive dyspnea on exertion and orthopnea over two weeks
|
| 23 |
|
| 24 |
### 二、Treatment-Focused(治療為主)
|
| 25 |
|
| 26 |
-
4. ECG shows a suspected acute STEMI
|
| 27 |
-
5. I have a patient diagnosed with bacterial meningitis
|
| 28 |
6. A patient is in septic shock with BP 80/50 mmHg and HR 120 bpm—what fluid resuscitation and vasopressor strategy would you recommend?
|
| 29 |
|
| 30 |
### 三、Mixed(診斷+治療綜合)
|
| 31 |
|
| 32 |
7. A 75-year-old diabetic presents with a non-healing foot ulcer and fever—what differential for osteomyelitis, diagnostic workup, and management plan do you suggest?
|
| 33 |
-
8. A 60-year-old COPD patient has worsening dyspnea and hypercapnia on ABG
|
| 34 |
-
9. A 28-year-old woman is experiencing postpartum hemorrhage
|
|
|
|
| 17 |
|
| 18 |
### 一、Diagnosis-Focused(診斷為主)
|
| 19 |
|
| 20 |
+
1. I have a 68-year-old man with atrial fibrillation presenting with sudden slurred speech and right-sided weakness. what are the possible diagnoses, and how would you evaluate them?
|
| 21 |
+
2. A 40-year-old woman reports fever, urinary frequency, and dysuria. what differential diagnoses should I consider, and which tests would you order?
|
| 22 |
+
3. A 50-year-old patient has progressive dyspnea on exertion and orthopnea over two weeks. what are the likely causes, and what diagnostic steps should I take?
|
| 23 |
|
| 24 |
### 二、Treatment-Focused(治療為主)
|
| 25 |
|
| 26 |
+
4. ECG shows a suspected acute STEMI. what immediate interventions should I initiate in the next five minutes?
|
| 27 |
+
5. I have a patient diagnosed with bacterial meningitis. What empiric antibiotic regimen and supportive measures should I implement?
|
| 28 |
6. A patient is in septic shock with BP 80/50 mmHg and HR 120 bpm—what fluid resuscitation and vasopressor strategy would you recommend?
|
| 29 |
|
| 30 |
### 三、Mixed(診斷+治療綜合)
|
| 31 |
|
| 32 |
7. A 75-year-old diabetic presents with a non-healing foot ulcer and fever—what differential for osteomyelitis, diagnostic workup, and management plan do you suggest?
|
| 33 |
+
8. A 60-year-old COPD patient has worsening dyspnea and hypercapnia on ABG. How would you confirm the diagnosis, and what is your stepwise treatment approach?
|
| 34 |
+
9. A 28-year-old woman is experiencing postpartum hemorrhage. what are the possible causes, what immediate resuscitation steps should I take, and how would you proceed with definitive management?
|
src/generation.py
CHANGED
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 
 # Fallback Generation Configuration (Simplified Architecture)
 FALLBACK_TIMEOUTS = {
-    "primary":
+    "primary": 60.0,    # Primary Med42-70B increased timeout for stable evaluation
     "fallback_1": 1.0,  # RAG template generation (renamed from fallback_2)
     "fallback_2": 0.1   # Minimal template generation (instant)
 }
@@ -308,14 +308,14 @@ class MedicalAdviceGenerator:
                 # Special formatting for hospital-specific guidelines
                 source_label = "Hospital Protocol"
                 context_part = f"""
-[Guideline {i}] (Source: {source_label}, Relevance: {1-distance:.3f})
-📋 {chunk.get('matched', 'Hospital Document')}
-{chunk_text}
+[Guideline {i}] (Source: {source_label}, Relevance: {1-distance:.3f})
+📋 {chunk.get('matched', 'Hospital Document')}
+{chunk_text}
 """.strip()
             else:
                 context_part = f"""
-[Guideline {i}] (Source: {chunk_type.title()},
-{chunk_text}
+[Guideline {i}] (Source: {chunk_type.title()}, Angular Distance: {distance:.3f})
+{chunk_text}
 """.strip()
 
             context_parts.append(context_part)
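For context, the sketch below (not part of the commit) shows how the updated else-branch template renders a retrieved chunk once the Angular Distance field is included; the sample values for chunk_type, chunk_text, and distance are assumed purely for illustration.

# Sketch only: assumed chunk fields and values, shown to illustrate the new template output.
i = 1
chunk_type = "emergency"
chunk_text = "Administer IV tPA within 4.5 hours of symptom onset if no contraindications."
distance = 0.412  # angular distance from the retriever (assumed value)

context_part = f"""
[Guideline {i}] (Source: {chunk_type.title()}, Angular Distance: {distance:.3f})
{chunk_text}
""".strip()

print(context_part)
# [Guideline 1] (Source: Emergency, Angular Distance: 0.412)
# Administer IV tPA within 4.5 hours of symptom onset if no contraindications.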
src/llm_clients.py
CHANGED
@@ -9,6 +9,8 @@ Date: 2025-07-29
 
 import logging
 import os
+import json
+import re
 from typing import Dict, Optional, Union, List
 from huggingface_hub import InferenceClient
 from dotenv import load_dotenv
@@ -68,6 +70,91 @@ class llm_Med42_70BClient:
             self.logger.error(f"Detailed Error: {repr(e)}")
             raise ValueError(f"Failed to initialize Medical LLM client: {str(e)}") from e
 
+    def fix_json_formatting(self, response_text: str) -> str:
+        """
+        Fix common JSON formatting errors.
+
+        Args:
+            response_text: Raw response text that may contain JSON errors
+
+        Returns:
+            Fixed JSON string
+        """
+        # 1. Fix missing commas between key-value pairs
+        #    (look for "value" "key" patterns and add a comma)
+        fixed = re.sub(r'"\s*\n\s*"', '",\n "', response_text)
+
+        # 2. Fix missing commas between values and keys
+        fixed = re.sub(r'"\s*(["\[])', r'",\1', fixed)
+
+        # 3. Remove trailing commas
+        fixed = re.sub(r',\s*}', '}', fixed)
+        fixed = re.sub(r',\s*]', ']', fixed)
+
+        # 4. Ensure string values are properly quoted
+        fixed = re.sub(r':\s*([^",{}\[\]]+)\s*([,}])', r': "\1"\2', fixed)
+
+        return fixed
+
+    def parse_medical_response(self, response_text: str) -> Dict:
+        """
+        Enhanced JSON parsing logic with error recovery.
+
+        Args:
+            response_text: Raw response text from Med42-70B
+
+        Returns:
+            Parsed response dictionary
+        """
+        try:
+            return json.loads(response_text)
+        except json.JSONDecodeError as e:
+            self.logger.warning(f"Initial JSON parsing failed: {e}")
+
+            # Attempt to fix common JSON errors
+            try:
+                fixed_response = self.fix_json_formatting(response_text)
+                self.logger.info("Attempting to parse fixed JSON")
+                return json.loads(fixed_response)
+            except json.JSONDecodeError as e2:
+                self.logger.error(f"Fixed JSON parsing also failed: {e2}")
+
+                # Try to extract partial information
+                try:
+                    return self.extract_partial_medical_info(response_text)
+                except:
+                    # Final fallback format
+                    return {
+                        "extracted_condition": "parsing_error",
+                        "confidence": "0.0",
+                        "is_medical": True,
+                        "raw_response": response_text,
+                        "error": str(e)
+                    }
+
+    def extract_partial_medical_info(self, response_text: str) -> Dict:
+        """
+        Extract partial medical information from a malformed response.
+
+        Args:
+            response_text: Malformed response text
+
+        Returns:
+            Dictionary with extracted information
+        """
+        # Try to extract condition, confidence, and the medical flag
+        condition_match = re.search(r'"extracted_condition":\s*"([^"]*)"', response_text)
+        confidence_match = re.search(r'"confidence":\s*"([^"]*)"', response_text)
+        medical_match = re.search(r'"is_medical":\s*(true|false)', response_text)
+
+        return {
+            "extracted_condition": condition_match.group(1) if condition_match else "unknown",
+            "confidence": confidence_match.group(1) if confidence_match else "0.0",
+            "is_medical": medical_match.group(1) == "true" if medical_match else True,
+            "raw_response": response_text,
+            "parsing_method": "partial_extraction"
+        }
+
     def analyze_medical_query(
         self,
         query: str,
@@ -138,6 +225,13 @@ class llm_Med42_70BClient:
             self.logger.info(f"Raw LLM Response: {response_text}")
             self.logger.info(f"Query Latency: {latency:.4f} seconds")
 
+            # Direct text extraction - the system prompt expects a plain text response.
+            # Since the system prompt instructs the LLM to "Return ONLY the primary condition name",
+            # extract directly from the text instead of attempting JSON parsing.
+            extracted_condition = self._extract_condition(response_text)
+            confidence = '0.8'
+            self.logger.info(f"Extracted condition from text: {extracted_condition}")
+
             # Detect abnormal response
             if self._is_abnormal_response(response_text):
                 self.logger.error(f"❌ Abnormal LLM response detected: {response_text[:50]}...")
@@ -149,15 +243,12 @@ class llm_Med42_70BClient:
                     'latency': latency
                 }
 
-            # Extract condition from response
-            extracted_condition = self._extract_condition(response_text)
-
             # Log the extracted condition
             self.logger.info(f"Extracted Condition: {extracted_condition}")
 
             return {
                 'extracted_condition': extracted_condition,
-                'confidence':
+                'confidence': confidence,
                 'raw_response': response_text,
                 'latency': latency  # Add latency to the return dictionary
             }
@@ -264,7 +355,7 @@ Focus on: conditions, symptoms, procedures, body systems."""
 
     def _extract_condition(self, response: str) -> str:
         """
-        Extract medical condition from model response.
+        Extract medical condition from model response with support for multiple formats.
 
         Args:
             response: Full model-generated text
@@ -272,18 +363,29 @@ Focus on: conditions, symptoms, procedures, body systems."""
         Returns:
             Extracted medical condition or empty string if non-medical
         """
+        from medical_conditions import CONDITION_KEYWORD_MAPPING
+
        # Check if this is a rejection response first
        if self._is_rejection_response(response):
            return ""
 
-
+        # Try the CONDITION: format first (primary format for structured responses)
+        match = re.search(r"CONDITION:\s*(.+)", response, re.IGNORECASE)
+        if not match:
+            # Try the "Primary condition:" format as a fallback
+            match = re.search(r"Primary condition:\s*(.+)", response, re.IGNORECASE)
+
+        if match:
+            value = match.group(1).strip()
+            if value.upper() not in ["NONE", "", "UNKNOWN"]:
+                return value
 
-        #
+        # Final fallback to keyword mapping for backward compatibility
        for condition in CONDITION_KEYWORD_MAPPING.keys():
            if condition.lower() in response.lower():
                return condition
 
-        return
+        return ""
 
     def _is_abnormal_response(self, response: str) -> bool:
         """
@@ -439,5 +541,136 @@ def main():
         'total_execution_time': total_execution_time
     }
 
+
+class llm_Llama3_70B_JudgeClient:
+    """
+    Llama3-70B client specifically for LLM judge evaluation.
+    Used for metrics 5-6 evaluation: Clinical Actionability & Evidence Quality.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "meta-llama/Meta-Llama-3-70B-Instruct",
+        timeout: float = 60.0
+    ):
+        """
+        Initialize Llama3-70B judge client for evaluation tasks.
+
+        Args:
+            model_name: Hugging Face model name for Llama3-70B
+            timeout: API call timeout duration (longer for judge evaluation)
+
+        Note: This client is specifically designed for third-party evaluation,
+        not for medical advice generation.
+        """
+        self.logger = logging.getLogger(__name__)
+        self.timeout = timeout
+        self.model_name = model_name
+
+        # Get Hugging Face token from environment
+        hf_token = os.getenv('HF_TOKEN')
+        if not hf_token:
+            self.logger.error("HF_TOKEN is missing from environment variables.")
+            raise ValueError(
+                "HF_TOKEN not found in environment variables. "
+                "Please set HF_TOKEN in your .env file or environment."
+            )
+
+        # Initialize Hugging Face Inference Client for judge evaluation
+        try:
+            self.client = InferenceClient(
+                provider="auto",
+                api_key=hf_token,
+            )
+            self.logger.info(f"Llama3-70B judge client initialized with model: {model_name}")
+            self.logger.info("Judge LLM: Evaluation tool only. Not for medical advice generation.")
+
+        except Exception as e:
+            self.logger.error(f"Failed to initialize Llama3-70B judge client: {e}")
+            raise
+
+    def generate_completion(self, prompt: str) -> Dict[str, Union[str, float]]:
+        """
+        Generate completion using Llama3-70B for judge evaluation.
+
+        Args:
+            prompt: Evaluation prompt for medical advice assessment
+
+        Returns:
+            Dict containing response content and timing information
+        """
+        import time
+
+        start_time = time.time()
+
+        try:
+            self.logger.info(f"Calling Llama3-70B Judge with evaluation prompt ({len(prompt)} chars)")
+
+            # Call Llama3-70B for judge evaluation
+            completion = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": prompt
+                    }
+                ],
+                max_tokens=2048,   # Sufficient for evaluation responses
+                temperature=0.1,   # Low temperature for consistent evaluation
+            )
+
+            # Extract response content
+            response_content = completion.choices[0].message.content
+
+            end_time = time.time()
+            latency = end_time - start_time
+
+            self.logger.info(f"Llama3-70B Judge Response: {response_content[:100]}...")
+            self.logger.info(f"Judge Evaluation Latency: {latency:.4f} seconds")
+
+            return {
+                'content': response_content,
+                'latency': latency,
+                'model': self.model_name,
+                'timestamp': time.time()
+            }
+
+        except Exception as e:
+            end_time = time.time()
+            error_latency = end_time - start_time
+
+            self.logger.error(f"Llama3-70B judge evaluation failed: {e}")
+            self.logger.error(f"Error occurred after {error_latency:.4f} seconds")
+
+            return {
+                'content': f"Judge evaluation error: {str(e)}",
+                'latency': error_latency,
+                'error': str(e),
+                'model': self.model_name,
+                'timestamp': time.time()
+            }
+
+    def batch_evaluate(self, evaluation_prompt: str) -> Dict[str, Union[str, float]]:
+        """
+        Specialized method for batch evaluation of medical advice.
+        Alias for generate_completion with judge-specific logging.
+
+        Args:
+            evaluation_prompt: Batch evaluation prompt containing multiple queries
+
+        Returns:
+            Dict containing batch evaluation results and timing
+        """
+        self.logger.info("Starting batch judge evaluation...")
+        result = self.generate_completion(evaluation_prompt)
+
+        if 'error' not in result:
+            self.logger.info(f"Batch evaluation completed successfully in {result['latency']:.2f}s")
+        else:
+            self.logger.error(f"Batch evaluation failed: {result.get('error', 'Unknown error')}")
+
+        return result
+
+
 if __name__ == "__main__":
     main()
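To make the new extraction cascade concrete, here is a standalone sketch of the logic _extract_condition now follows: a structured "CONDITION:" line first, then "Primary condition:", then the keyword-mapping fallback. The stubbed CONDITION_KEYWORD_MAPPING below is illustrative only; the real table lives in src/medical_conditions.py.

import re

# Illustrative stub of the keyword table (the real one is in src/medical_conditions.py).
CONDITION_KEYWORD_MAPPING = {"bacterial meningitis": {}, "postpartum hemorrhage": {}}

def extract_condition(response: str) -> str:
    # 1) Structured "CONDITION:" line, then "Primary condition:" as fallback
    match = re.search(r"CONDITION:\s*(.+)", response, re.IGNORECASE)
    if not match:
        match = re.search(r"Primary condition:\s*(.+)", response, re.IGNORECASE)
    if match:
        value = match.group(1).strip()
        if value.upper() not in ["NONE", "", "UNKNOWN"]:
            return value
    # 2) Keyword-mapping fallback for free-text responses
    for condition in CONDITION_KEYWORD_MAPPING:
        if condition.lower() in response.lower():
            return condition
    return ""

print(extract_condition("MEDICAL: YES\nCONDITION: acute STEMI\nCONFIDENCE: 0.9"))  # acute STEMI
print(extract_condition("Findings are consistent with bacterial meningitis."))     # bacterial meningitis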
src/medical_conditions.py
CHANGED
@@ -63,6 +63,14 @@ CONDITION_KEYWORD_MAPPING: Dict[str, Dict[str, str]] = {
     "seizure disorder": {
         "emergency": "seizure|status epilepticus|postictal state",
         "treatment": "antiepileptic drugs|EEG monitoring|neurology consult"
-    }
+    },
+    "postpartum hemorrhage": {
+        "emergency": "postpartum hemorrhage|uterine atony|placental retention|vaginal laceration",
+        "treatment": "uterine massage|IV oxytocin infusion|blood transfusion|surgical intervention"
+    },
+    "bacterial meningitis": {
+        "emergency": "bacterial meningitis|fever|headache|neck stiffness|altered mental status|meningitis|meningeal signs",
+        "treatment": "empiric antibiotics|ceftriaxone|vancomycin|dexamethasone|lumbar puncture"
+    }
 }
 
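The pipe-separated keyword strings in the new entries read naturally as regex alternations. A small sketch of how a caller might match them follows; the helper and its usage are assumptions for illustration, not code from the module.

import re

# Hypothetical helper; the entry format mirrors CONDITION_KEYWORD_MAPPING values.
entry = {
    "emergency": "postpartum hemorrhage|uterine atony|placental retention|vaginal laceration",
    "treatment": "uterine massage|IV oxytocin infusion|blood transfusion|surgical intervention",
}

def mentions_emergency(text: str) -> bool:
    # The pipe-separated keywords act as a regex alternation
    return re.search(entry["emergency"], text, re.IGNORECASE) is not None

print(mentions_emergency("Suspected uterine atony after delivery"))  # True
print(mentions_emergency("Routine postnatal follow-up"))             # False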
src/user_prompt.py
CHANGED
@@ -255,13 +255,15 @@ Return ONLY the specified format."""
                 timeout=12.0  # Single call timeout
             )
 
+            # Get both raw response and extracted condition
+            raw_response = llama_response.get('raw_response', '').strip()
             response_text = llama_response.get('extracted_condition', '').strip()
             logger.info(f"🤖 Combined L2+4 result: {response_text}")
 
-            # Parse structured response
-            medical_status = self._extract_field(
-            condition_name = self._extract_field(
-            confidence = self._extract_field(
+            # Parse structured response from raw LLM output (not extracted condition)
+            medical_status = self._extract_field(raw_response, 'MEDICAL')
+            condition_name = self._extract_field(raw_response, 'CONDITION')
+            confidence = self._extract_field(raw_response, 'CONFIDENCE')
 
             # Non-medical query detection
             if medical_status == 'NO':
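The diff shows _extract_field being called with the raw LLM output and a field name, but its body is not part of this changeset. A parser consistent with those call sites might look like the hypothetical sketch below; the function name and regex are assumptions, not the repository's implementation.

import re

# Hypothetical sketch consistent with the new call sites; the real _extract_field
# in src/user_prompt.py is not shown in this diff.
def extract_field(raw_response: str, field: str) -> str:
    match = re.search(rf"{field}:\s*(.+)", raw_response, re.IGNORECASE)
    return match.group(1).strip() if match else ""

raw = "MEDICAL: YES\nCONDITION: septic shock\nCONFIDENCE: 0.85"
print(extract_field(raw, "MEDICAL"))     # YES
print(extract_field(raw, "CONDITION"))   # septic shock
print(extract_field(raw, "CONFIDENCE"))  # 0.85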