Spaces:
YanBoChen
committed on
Commit · 17613c8
1 Parent(s): 88e76fd
feat: Add Extraction, LLM Judge, and Relevance Chart Generators
- Implemented ExtractionChartGenerator to visualize extraction success rates from JSON statistics.
- Developed LLMJudgeEvaluator for batch evaluation of medical advice quality using Llama3-70B.
- Created RelevanceChartGenerator to generate charts for retrieval relevance metrics and cosine similarity analysis.
- Each generator includes methods for loading statistics, generating charts, and saving results; an illustrative sketch of the statistics layout they read follows this list.
- Added error handling and user-friendly messages for file operations and evaluations.
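
Each chart generator below reads the most recent statistics JSON from evaluation/results/, draws a 2x2 matplotlib figure, and writes it back to the same folder as a timestamped PNG. As an illustration only, here is a minimal sketch of the coverage statistics layout that coverage_chart_generator.py expects; the field names are taken from the reads in that file, while the values and the exact schema written by latency_evaluator.py are assumptions:

# Hypothetical example of evaluation/results/coverage_statistics_<timestamp>.json,
# shown as the dict returned by json.load(); the values are invented.
example_coverage_stats = {
    "category_results": {
        "diagnosis": {
            "average_coverage": 0.62,        # fraction, plotted as a percentage
            "min_coverage": 0.41,
            "max_coverage": 0.83,
            "successful_evaluations": 3,
            "total_queries": 3,
            "meets_threshold": True,         # drives the "Target Met" table column
            "individual_coverage_scores": [0.41, 0.62, 0.83]
        }
        # "treatment" and "mixed" follow the same shape
    },
    "overall_results": {
        "average_coverage": 0.62,
        "min_coverage": 0.41,
        "max_coverage": 0.83,
        "successful_queries": 3,
        "total_queries": 3,
        "meets_threshold": True
    }
}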
evaluation/coverage_chart_generator.py
ADDED
@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Coverage Chart Generator
===========================================

Generates retrieval coverage charts from saved statistics.
Shows how well generated advice utilizes retrieved content.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class CoverageChartGenerator:
    """Generate charts for retrieval coverage metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("📊 Initializing Coverage Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_coverage_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent coverage statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "coverage_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No coverage statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📁 Loading coverage statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_coverage_charts(self, stats: Dict[str, Any]) -> str:
        """Generate coverage analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Retrieval Coverage Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Average Coverage by Category
            ax1 = axes[0, 0]
            categories = []
            avg_coverages = []

            for category, cat_stats in category_results.items():
                if cat_stats['successful_evaluations'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_coverages.append(cat_stats['average_coverage'] * 100)  # Convert to percentage

            categories.append('Overall')
            avg_coverages.append(overall_results['average_coverage'] * 100)

            bars = ax1.bar(categories, avg_coverages, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Average Coverage Score by Category', fontweight='bold')
            ax1.set_ylabel('Coverage Score (%)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add target line
            ax1.axhline(y=60, color='red', linestyle='--', alpha=0.7, label='60% Target')
            ax1.legend()

            # Add value labels
            for bar, coverage in zip(bars, avg_coverages):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                         f'{coverage:.1f}%', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Coverage Distribution
            ax2 = axes[0, 1]

            # Collect all individual coverage scores
            all_scores = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_coverage_scores'):
                    all_scores.extend([score * 100 for score in cat_stats['individual_coverage_scores']])

            if all_scores:
                # Create histogram
                ax2.hist(all_scores, bins=15, alpha=0.7, color='lightcoral', edgecolor='black')
                ax2.axvline(x=60, color='red', linestyle='--', alpha=0.7, label='60% Target')
                ax2.axvline(x=np.mean(all_scores), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(all_scores):.1f}%')

                ax2.set_title('Coverage Score Distribution', fontweight='bold')
                ax2.set_xlabel('Coverage Score (%)')
                ax2.set_ylabel('Frequency')
                ax2.legend()
                ax2.grid(True, alpha=0.3)
            else:
                ax2.text(0.5, 0.5, 'No coverage data available', ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Coverage Score Distribution', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Avg Coverage', 'Min/Max', 'Success/Total', 'Target Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_queries'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['average_coverage']:.3f}",
                        f"{cat_stats['min_coverage']:.3f}/{cat_stats['max_coverage']:.3f}",
                        f"{cat_stats['successful_evaluations']}/{cat_stats['total_queries']}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['average_coverage']:.3f}",
                f"{overall_results['min_coverage']:.3f}/{overall_results['max_coverage']:.3f}",
                f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
                '✅' if overall_results.get('meets_threshold', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Coverage Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Coverage Performance Radar/Gauge
            ax4 = axes[1, 1]

            # Create gauge-like visualization for overall coverage
            overall_coverage_pct = overall_results['average_coverage'] * 100

            # Pie chart as gauge
            sizes = [overall_coverage_pct, 100 - overall_coverage_pct]
            colors = ['#2ca02c' if overall_coverage_pct >= 60 else '#ff7f0e', '#f0f0f0']

            wedges, texts, autotexts = ax4.pie(sizes, labels=['Covered', 'Not Covered'],
                                               autopct='%1.1f%%',
                                               colors=colors,
                                               startangle=90,
                                               counterclock=False)

            # Add center text
            ax4.text(0, 0, f'{overall_coverage_pct:.1f}%\nCoverage',
                     ha='center', va='center', fontsize=14, fontweight='bold')

            ax4.set_title(f'Overall Coverage Performance\n{"✅ Target Met" if overall_coverage_pct >= 60 else "❌ Below Target"}',
                          fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"coverage_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"📊 Coverage charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Coverage chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent coverage chart generation"""

    print("🚀 OnCall.ai Coverage Chart Generator")

    chart_gen = CoverageChartGenerator()

    try:
        stats = chart_gen.load_latest_coverage_statistics()
        chart_path = chart_gen.generate_coverage_charts(stats)

        print(f"\n✅ Coverage chart generation complete!")
        print(f"📊 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate coverage statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
evaluation/direct_llm_evaluator.py
ADDED
@@ -0,0 +1,401 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Direct LLM Evaluator (Med42-70B Only)
========================================================

Tests Med42-70B directly without RAG pipeline.
Only applicable metrics: 1 (Latency), 5 (Actionability), 6 (Evidence Quality)

Metrics 2-4 (Extraction, Relevance, Coverage) are not applicable for direct LLM.

Author: YanBo Chen
Date: 2025-08-04
"""

import time
import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import re

# Add project path
current_dir = Path(__file__).parent
project_root = current_dir.parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Import LLM client only (no retrieval system needed)
try:
    from llm_clients import llm_Med42_70BClient
except ImportError as e:
    print(f"❌ Import failed: {e}")
    print("Please ensure running from project root directory")
    sys.exit(1)


class DirectLLMEvaluator:
    """Direct LLM evaluation without RAG pipeline"""

    def __init__(self):
        """Initialize direct LLM client only"""
        print("🔧 Initializing Direct LLM Evaluator...")

        # Initialize only LLM client (no retrieval, no user_prompt processing)
        self.llm_client = llm_Med42_70BClient()

        # Results accumulation
        self.direct_results = []
        self.medical_outputs = []

        print("✅ Direct LLM Evaluator initialization complete")

    def evaluate_direct_llm_query(self, query: str, category: str = "unknown") -> Dict[str, Any]:
        """
        Direct LLM evaluation for single query

        Only tests direct LLM response without RAG pipeline
        Applicable metrics: 1 (Latency), 5-6 (via medical output)

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)
        """
        print(f"🔍 Direct LLM evaluation: {query[:50]}...")
        print(f"📋 Category: {category}")

        overall_start = time.time()

        try:
            # Direct LLM call without any RAG processing
            llm_start = time.time()

            # Create direct medical consultation prompt
            direct_prompt = f"""
You are a medical expert providing clinical guidance.

Patient Query: {query}

Please provide comprehensive medical advice including:
1. Differential diagnosis (if applicable)
2. Immediate assessment steps
3. Treatment recommendations
4. Clinical considerations

Provide evidence-based, actionable medical guidance.
"""

            # Direct LLM generation
            response = self.llm_client.generate_completion(direct_prompt)
            medical_advice = response.get('content', '') if isinstance(response, dict) else str(response)

            llm_time = time.time() - llm_start
            total_time = time.time() - overall_start

            # Create result
            result = {
                "query": query,
                "category": category,

                # Metric 1: Total Latency (direct LLM call time)
                "latency_metrics": {
                    "total_latency": total_time,
                    "llm_generation_time": llm_time,
                    "meets_target": total_time <= 30.0
                },

                # Metrics 2-4: Not applicable for direct LLM
                "extraction_metrics": {
                    "not_applicable": True,
                    "reason": "No extraction pipeline in direct LLM"
                },
                "relevance_metrics": {
                    "not_applicable": True,
                    "reason": "No retrieval pipeline in direct LLM"
                },
                "coverage_metrics": {
                    "not_applicable": True,
                    "reason": "No retrieval content to cover"
                },

                # Medical advice for metrics 5-6 evaluation
                "medical_advice": medical_advice,
                "advice_length": len(medical_advice),

                "overall_success": True,
                "model_type": "Med42-70B_direct",
                "timestamp": datetime.now().isoformat()
            }

            # Store result
            self.direct_results.append(result)

            # Store medical output for LLM judge evaluation
            medical_output = {
                "query": query,
                "category": category,
                "medical_advice": medical_advice,
                "query_id": f"{category}_query_direct",
                "model_type": "Med42-70B_direct",
                "processing_time": total_time,
                "timestamp": datetime.now().isoformat()
            }
            self.medical_outputs.append(medical_output)

            print(f"✅ Direct LLM completed in {total_time:.2f}s")
            print(f"📝 Generated advice: {len(medical_advice)} characters")

            return result

        except Exception as e:
            total_time = time.time() - overall_start
            print(f"❌ Direct LLM evaluation failed after {total_time:.2f}s: {e}")

            error_result = {
                "query": query,
                "category": category,
                "latency_metrics": {
                    "total_latency": total_time,
                    "meets_target": False
                },
                "overall_success": False,
                "error": str(e),
                "model_type": "Med42-70B_direct",
                "timestamp": datetime.now().isoformat()
            }

            self.direct_results.append(error_result)
            return error_result

    def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
        """Parse queries from file with category labels"""
        print(f"📁 Reading queries from file: {filepath}")

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            queries_by_category = {
                "diagnosis": [],
                "treatment": [],
                "mixed": []
            }

            lines = content.strip().split('\n')

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
                if match:
                    category_raw = match.group(1).lower()
                    query_text = match.group(2).strip()

                    if category_raw in ['mixed/complicated', 'mixed']:
                        category = 'mixed'
                    else:
                        category = category_raw

                    if category in queries_by_category and len(query_text) > 15:
                        queries_by_category[category].append({
                            "text": query_text,
                            "category": category
                        })

            print(f"📋 Parsed queries by category:")
            for category, category_queries in queries_by_category.items():
                print(f"   {category.capitalize()}: {len(category_queries)} queries")

            return queries_by_category

        except Exception as e:
            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

    def calculate_direct_llm_statistics(self) -> Dict[str, Any]:
        """Calculate statistics for direct LLM evaluation"""
        successful_results = [r for r in self.direct_results if r.get('overall_success')]

        if successful_results:
            latencies = [r['latency_metrics']['total_latency'] for r in successful_results]

            # Category-wise statistics
            category_stats = {}
            results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}

            for result in successful_results:
                category = result.get('category', 'unknown')
                if category in results_by_category:
                    results_by_category[category].append(result)

            for category, results in results_by_category.items():
                if results:
                    cat_latencies = [r['latency_metrics']['total_latency'] for r in results]
                    category_stats[category] = {
                        "average_latency": sum(cat_latencies) / len(cat_latencies),
                        "query_count": len(cat_latencies),
                        "target_compliance": sum(1 for lat in cat_latencies if lat <= 30.0) / len(cat_latencies)
                    }
                else:
                    category_stats[category] = {
                        "average_latency": 0.0,
                        "query_count": 0,
                        "target_compliance": 0.0
                    }

            # Overall statistics
            overall_stats = {
                "average_latency": sum(latencies) / len(latencies),
                "min_latency": min(latencies),
                "max_latency": max(latencies),
                "successful_queries": len(successful_results),
                "total_queries": len(self.direct_results),
                "success_rate": len(successful_results) / len(self.direct_results),
                "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies)
            }
        else:
            category_stats = {cat: {"average_latency": 0.0, "query_count": 0, "target_compliance": 0.0}
                              for cat in ["diagnosis", "treatment", "mixed"]}
            overall_stats = {
                "average_latency": 0.0,
                "successful_queries": 0,
                "total_queries": len(self.direct_results),
                "success_rate": 0.0,
                "target_compliance": 0.0
            }

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "model_type": "Med42-70B_direct",
            "timestamp": datetime.now().isoformat()
        }

    def save_direct_llm_statistics(self, filename: str = None) -> str:
        """Save direct LLM statistics"""
        stats = self.calculate_direct_llm_statistics()

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"direct_llm_statistics_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

        print(f"📊 Direct LLM statistics saved to: {filepath}")
        return str(filepath)

    def save_direct_medical_outputs(self, filename: str = None) -> str:
        """Save medical outputs for LLM judge evaluation"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"medical_outputs_direct_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        output_data = {
            "evaluation_metadata": {
                "total_outputs": len(self.medical_outputs),
                "categories": list(set(output['category'] for output in self.medical_outputs)),
                "timestamp": datetime.now().isoformat(),
                "model_type": "Med42-70B_direct"
            },
            "medical_outputs": self.medical_outputs
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"📁 Direct medical outputs saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent direct LLM evaluation interface"""

    print("🚀 OnCall.ai Direct LLM Evaluator - Med42-70B Only")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
    else:
        query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python direct_llm_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = DirectLLMEvaluator()

    # Parse queries
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test direct LLM for each query
    print(f"\n🧪 Direct LLM Testing (No RAG Pipeline)")

    for category, queries in queries_by_category.items():
        if not queries:
            continue

        print(f"\n📋 Testing {category.upper()} with direct Med42-70B:")

        for i, query_info in enumerate(queries):
            query_text = query_info['text']

            # Direct LLM evaluation
            result = evaluator.evaluate_direct_llm_query(query_text, category)

            # Pause between queries
            if i < len(queries) - 1:
                print(f"   ⏳ Pausing 5s before next query...")
                time.sleep(5)

        # Pause between categories
        if category != list(queries_by_category.keys())[-1]:
            print(f"\n⏳ Pausing 10s before next category...")
            time.sleep(10)

    # Save results
    print(f"\n📊 Generating direct LLM analysis...")

    stats_path = evaluator.save_direct_llm_statistics()
    outputs_path = evaluator.save_direct_medical_outputs()

    # Print summary
    stats = evaluator.calculate_direct_llm_statistics()
    overall_results = stats['overall_results']

    print(f"\n📊 === DIRECT LLM EVALUATION SUMMARY ===")
    print(f"Overall Performance:")
    print(f"   Average Latency: {overall_results['average_latency']:.2f}s")
    print(f"   Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
    print(f"   30s Target Compliance: {overall_results['target_compliance']:.1%}")

    print(f"\nApplicable Metrics:")
    print(f"   ✅ Metric 1 (Latency): Measured")
    print(f"   ❌ Metric 2 (Extraction): Not applicable - no extraction pipeline")
    print(f"   ❌ Metric 3 (Relevance): Not applicable - no retrieval pipeline")
    print(f"   ❌ Metric 4 (Coverage): Not applicable - no retrieval content")
    print(f"   📊 Metric 5 (Actionability): Requires LLM judge evaluation")
    print(f"   📊 Metric 6 (Evidence): Requires LLM judge evaluation")

    print(f"\n✅ Direct LLM evaluation complete!")
    print(f"📊 Statistics: {stats_path}")
    print(f"📁 Medical Outputs: {outputs_path}")
    print(f"\n💡 Next step: Run llm_judge_evaluator.py for metrics 5-6")
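
The parse_queries_from_file method above only keeps lines of the form <number>.<category>: <query text> with more than 15 characters of query text. A minimal standalone sketch of that input format using the same regex; the sample queries are invented for illustration:

import re

# Invented sample lines in the pre_user_query_evaluate.txt format parsed above
sample_lines = [
    "1.diagnosis: 65-year-old male with crushing chest pain radiating to the left arm for 30 minutes",
    "2.treatment: how should acute ischemic stroke be managed within the first hour of onset?",
    "3.mixed/complicated: pregnant patient with suspected pulmonary embolism and impaired renal function",
]

pattern = r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)'

for line in sample_lines:
    match = re.match(pattern, line, re.IGNORECASE)
    if match:
        # 'mixed/complicated' is folded into 'mixed', mirroring the evaluator
        category = match.group(1).lower()
        category = 'mixed' if category in ('mixed/complicated', 'mixed') else category
        print(f"{category}: {match.group(2).strip()[:40]}...")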
evaluation/extraction_chart_generator.py
ADDED
@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Extraction Chart Generator
============================================

Generates extraction success rate charts from saved statistics.
Reads JSON files produced by comprehensive evaluator.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class ExtractionChartGenerator:
    """Generate charts for condition extraction metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("📊 Initializing Extraction Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_extraction_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent extraction statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "extraction_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No extraction statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📁 Loading extraction statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_extraction_charts(self, stats: Dict[str, Any]) -> str:
        """Generate extraction success rate analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Extraction Success Rate Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Success Rate by Category
            ax1 = axes[0, 0]
            categories = []
            success_rates = []

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    success_rates.append(cat_stats['success_rate'] * 100)

            categories.append('Overall')
            success_rates.append(overall_results['success_rate'] * 100)

            bars = ax1.bar(categories, success_rates, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Extraction Success Rate by Category', fontweight='bold')
            ax1.set_ylabel('Success Rate (%)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add target line
            ax1.axhline(y=80, color='red', linestyle='--', alpha=0.7, label='80% Target')
            ax1.legend()

            # Add value labels
            for bar, rate in zip(bars, success_rates):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                         f'{rate:.1f}%', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Success Count
            ax2 = axes[0, 1]
            successful_counts = []
            total_counts = []

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    successful_counts.append(cat_stats['successful_count'])
                    total_counts.append(cat_stats['total_count'])

            successful_counts.append(overall_results['successful_count'])
            total_counts.append(overall_results['total_count'])

            x = np.arange(len(categories))
            width = 0.35

            ax2.bar(x - width/2, successful_counts, width, label='Successful', alpha=0.8)
            ax2.bar(x + width/2, total_counts, width, label='Total', alpha=0.8)

            ax2.set_title('Extraction Success Count', fontweight='bold')
            ax2.set_ylabel('Query Count')
            ax2.set_xlabel('Query Category')
            ax2.set_xticks(x)
            ax2.set_xticklabels(categories)
            ax2.legend()
            ax2.grid(True, alpha=0.3)

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Success Rate', 'Success/Total', 'Avg Time (s)', 'Target Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_count'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['success_rate']:.1%}",
                        f"{cat_stats['successful_count']}/{cat_stats['total_count']}",
                        f"{cat_stats['average_extraction_time']:.3f}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['success_rate']:.1%}",
                f"{overall_results['successful_count']}/{overall_results['total_count']}",
                '-',
                '✅' if overall_results.get('target_compliance', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Extraction Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Performance visualization
            ax4 = axes[1, 1]

            # Simple performance indicator
            overall_rate = overall_results['success_rate'] * 100
            colors = ['#d62728' if overall_rate < 80 else '#2ca02c']

            wedges, texts, autotexts = ax4.pie([overall_rate, 100 - overall_rate],
                                               labels=['Successful', 'Failed'],
                                               autopct='%1.1f%%',
                                               colors=['#2ca02c', '#ffcccc'],
                                               startangle=90)

            ax4.set_title(f'Overall Extraction Success\n{overall_rate:.1f}% Success Rate', fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"extraction_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"📊 Extraction charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Extraction chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent extraction chart generation"""

    print("🚀 OnCall.ai Extraction Chart Generator")

    chart_gen = ExtractionChartGenerator()

    try:
        stats = chart_gen.load_latest_extraction_statistics()
        chart_path = chart_gen.generate_extraction_charts(stats)

        print(f"\n✅ Extraction chart generation complete!")
        print(f"📊 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate extraction statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
evaluation/llm_judge_evaluator.py
ADDED
@@ -0,0 +1,401 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OnCall.ai System - LLM Judge Evaluator (Metrics 5-6)
|
| 4 |
+
====================================================
|
| 5 |
+
|
| 6 |
+
Uses Llama3-70B as third-party judge to evaluate medical advice quality.
|
| 7 |
+
Batch evaluation strategy: 1 call evaluates all queries for maximum efficiency.
|
| 8 |
+
|
| 9 |
+
Metrics evaluated:
|
| 10 |
+
5. Clinical Actionability (θ¨εΊε―ζδ½ζ§)
|
| 11 |
+
6. Clinical Evidence Quality (θ¨εΊθζεθ³ͺ)
|
| 12 |
+
|
| 13 |
+
Author: YanBo Chen
|
| 14 |
+
Date: 2025-08-04
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
from typing import Dict, List, Any, Tuple
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
import glob
|
| 24 |
+
import re
|
| 25 |
+
|
| 26 |
+
# Add project path
|
| 27 |
+
current_dir = Path(__file__).parent
|
| 28 |
+
project_root = current_dir.parent
|
| 29 |
+
src_dir = project_root / "src"
|
| 30 |
+
sys.path.insert(0, str(src_dir))
|
| 31 |
+
|
| 32 |
+
# Import LLM client for judge evaluation
|
| 33 |
+
try:
|
| 34 |
+
from llm_clients import llm_Med42_70BClient # Temporarily use Med42 as placeholder
|
| 35 |
+
# TODO: Replace with actual Llama3-70B client when available
|
| 36 |
+
except ImportError as e:
|
| 37 |
+
print(f"β Import failed: {e}")
|
| 38 |
+
print("Please ensure running from project root directory")
|
| 39 |
+
sys.exit(1)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class LLMJudgeEvaluator:
|
| 43 |
+
"""LLM judge evaluator using batch evaluation strategy"""
|
| 44 |
+
|
| 45 |
+
def __init__(self):
|
| 46 |
+
"""Initialize judge LLM client"""
|
| 47 |
+
print("π§ Initializing LLM Judge Evaluator...")
|
| 48 |
+
|
| 49 |
+
# TODO: Replace with actual Llama3-70B client
|
| 50 |
+
# For now, using Med42 as placeholder
|
| 51 |
+
self.judge_llm = llm_Med42_70BClient()
|
| 52 |
+
print("β οΈ Note: Using Med42 as placeholder for Llama3-70B judge")
|
| 53 |
+
|
| 54 |
+
self.evaluation_results = []
|
| 55 |
+
|
| 56 |
+
print("β
LLM Judge Evaluator initialization complete")
|
| 57 |
+
|
| 58 |
+
def load_medical_outputs(self, filepath: str) -> List[Dict[str, Any]]:
|
| 59 |
+
"""Load medical outputs from file"""
|
| 60 |
+
print(f"π Loading medical outputs from: {filepath}")
|
| 61 |
+
|
| 62 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 63 |
+
data = json.load(f)
|
| 64 |
+
|
| 65 |
+
medical_outputs = data.get('medical_outputs', [])
|
| 66 |
+
print(f"π Loaded {len(medical_outputs)} medical outputs")
|
| 67 |
+
|
| 68 |
+
return medical_outputs
|
| 69 |
+
|
| 70 |
+
def find_latest_medical_outputs(self, model_type: str = "rag") -> str:
|
| 71 |
+
"""Find the latest medical outputs file"""
|
| 72 |
+
results_dir = Path(__file__).parent / "results"
|
| 73 |
+
|
| 74 |
+
if model_type == "rag":
|
| 75 |
+
pattern = str(results_dir / "medical_outputs_*.json")
|
| 76 |
+
else: # direct
|
| 77 |
+
pattern = str(results_dir / "medical_outputs_direct_*.json")
|
| 78 |
+
|
| 79 |
+
output_files = glob.glob(pattern)
|
| 80 |
+
|
| 81 |
+
if not output_files:
|
| 82 |
+
raise FileNotFoundError(f"No medical outputs files found for {model_type} model")
|
| 83 |
+
|
| 84 |
+
latest_file = max(output_files, key=os.path.getmtime)
|
| 85 |
+
print(f"π Found latest medical outputs: {latest_file}")
|
| 86 |
+
|
| 87 |
+
return latest_file
|
| 88 |
+
|
| 89 |
+
def create_batch_evaluation_prompt(self, medical_outputs: List[Dict[str, Any]]) -> str:
|
| 90 |
+
"""
|
| 91 |
+
Create batch evaluation prompt for all queries at once
|
| 92 |
+
|
| 93 |
+
Maximum efficiency: 1 LLM call evaluates all queries
|
| 94 |
+
"""
|
| 95 |
+
prompt_parts = [
|
| 96 |
+
"You are a medical expert evaluating clinical advice quality.",
|
| 97 |
+
"Please evaluate each medical advice response on TWO criteria:",
|
| 98 |
+
"",
|
| 99 |
+
"CRITERIA:",
|
| 100 |
+
"1. Clinical Actionability (1-10): Can healthcare providers immediately act on this advice?",
|
| 101 |
+
"2. Clinical Evidence Quality (1-10): Is the advice evidence-based and follows medical standards?",
|
| 102 |
+
"",
|
| 103 |
+
"QUERIES TO EVALUATE:",
|
| 104 |
+
""
|
| 105 |
+
]
|
| 106 |
+
|
| 107 |
+
# Add each query and advice
|
| 108 |
+
for i, output in enumerate(medical_outputs, 1):
|
| 109 |
+
query = output.get('query', '')
|
| 110 |
+
advice = output.get('medical_advice', '')
|
| 111 |
+
category = output.get('category', 'unknown')
|
| 112 |
+
|
| 113 |
+
prompt_parts.extend([
|
| 114 |
+
f"=== QUERY {i} ({category.upper()}) ===",
|
| 115 |
+
f"Patient Query: {query}",
|
| 116 |
+
f"Medical Advice: {advice}",
|
| 117 |
+
""
|
| 118 |
+
])
|
| 119 |
+
|
| 120 |
+
prompt_parts.extend([
|
| 121 |
+
"RESPONSE FORMAT (provide exactly this format):",
|
| 122 |
+
""
|
| 123 |
+
])
|
| 124 |
+
|
| 125 |
+
# Add response format template
|
| 126 |
+
for i in range(1, len(medical_outputs) + 1):
|
| 127 |
+
prompt_parts.append(f"Query {i}: Actionability=X, Evidence=Y")
|
| 128 |
+
|
| 129 |
+
prompt_parts.extend([
|
| 130 |
+
"",
|
| 131 |
+
"Replace X and Y with numeric scores 1-10.",
|
| 132 |
+
"Provide only the scores in the exact format above."
|
| 133 |
+
])
|
| 134 |
+
|
| 135 |
+
return "\n".join(prompt_parts)
|
| 136 |
+
|
| 137 |
+
def parse_batch_evaluation_response(self, response: str, medical_outputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 138 |
+
"""Parse batch evaluation response into individual scores"""
|
| 139 |
+
results = []
|
| 140 |
+
|
| 141 |
+
# Parse response format: "Query 1: Actionability=8, Evidence=7"
|
| 142 |
+
lines = response.strip().split('\n')
|
| 143 |
+
|
| 144 |
+
for i, line in enumerate(lines):
|
| 145 |
+
line = line.strip()
|
| 146 |
+
if not line:
|
| 147 |
+
continue
|
| 148 |
+
|
| 149 |
+
# Try to match pattern: "Query X: Actionability=Y, Evidence=Z"
|
| 150 |
+
match = re.match(r'Query\s+(\d+):\s*Actionability\s*=\s*(\d+)\s*,\s*Evidence\s*=\s*(\d+)', line, re.IGNORECASE)
|
| 151 |
+
|
| 152 |
+
if match:
|
| 153 |
+
query_num = int(match.group(1)) - 1 # Convert to 0-based index
|
| 154 |
+
actionability_score = int(match.group(2))
|
| 155 |
+
evidence_score = int(match.group(3))
|
| 156 |
+
|
| 157 |
+
if query_num < len(medical_outputs):
|
| 158 |
+
output = medical_outputs[query_num]
|
| 159 |
+
|
| 160 |
+
result = {
|
| 161 |
+
"query": output.get('query', ''),
|
| 162 |
+
"category": output.get('category', 'unknown'),
|
| 163 |
+
"model_type": output.get('model_type', 'unknown'),
|
| 164 |
+
"medical_advice": output.get('medical_advice', ''),
|
| 165 |
+
|
| 166 |
+
# Metric 5: Clinical Actionability
|
| 167 |
+
"actionability_score": actionability_score / 10.0, # Normalize to 0-1
|
| 168 |
+
"actionability_raw": actionability_score,
|
| 169 |
+
|
| 170 |
+
# Metric 6: Clinical Evidence Quality
|
| 171 |
+
"evidence_score": evidence_score / 10.0, # Normalize to 0-1
|
| 172 |
+
"evidence_raw": evidence_score,
|
| 173 |
+
|
| 174 |
+
"evaluation_success": True,
|
| 175 |
+
"timestamp": datetime.now().isoformat()
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
results.append(result)
|
| 179 |
+
|
| 180 |
+
return results
|
| 181 |
+
|
| 182 |
+
def evaluate_batch_medical_outputs(self, medical_outputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 183 |
+
"""
|
| 184 |
+
Batch evaluate all medical outputs using single LLM call
|
| 185 |
+
|
| 186 |
+
Args:
|
| 187 |
+
medical_outputs: List of medical advice outputs to evaluate
|
| 188 |
+
"""
|
| 189 |
+
print(f"π§ Batch evaluating {len(medical_outputs)} medical outputs...")
|
| 190 |
+
|
| 191 |
+
try:
|
| 192 |
+
# Create batch evaluation prompt
|
| 193 |
+
batch_prompt = self.create_batch_evaluation_prompt(medical_outputs)
|
| 194 |
+
|
| 195 |
+
print(f"π Batch prompt created ({len(batch_prompt)} characters)")
|
| 196 |
+
print(f"π Calling judge LLM for batch evaluation...")
|
| 197 |
+
|
| 198 |
+
# Single LLM call for all evaluations
|
| 199 |
+
eval_start = time.time()
|
| 200 |
+
response = self.judge_llm.generate_completion(batch_prompt)
|
| 201 |
+
eval_time = time.time() - eval_start
|
| 202 |
+
|
| 203 |
+
# Extract response text
|
| 204 |
+
response_text = response.get('content', '') if isinstance(response, dict) else str(response)
|
| 205 |
+
|
| 206 |
+
print(f"β
Judge LLM completed batch evaluation in {eval_time:.2f}s")
|
| 207 |
+
print(f"π Response length: {len(response_text)} characters")
|
| 208 |
+
|
| 209 |
+
# Parse batch response
|
| 210 |
+
parsed_results = self.parse_batch_evaluation_response(response_text, medical_outputs)
|
| 211 |
+
|
| 212 |
+
if len(parsed_results) != len(medical_outputs):
|
| 213 |
+
print(f"β οΈ Warning: Expected {len(medical_outputs)} results, got {len(parsed_results)}")
|
| 214 |
+
|
| 215 |
+
self.evaluation_results.extend(parsed_results)
|
| 216 |
+
|
| 217 |
+
print(f"π Successfully parsed {len(parsed_results)} evaluation results")
|
| 218 |
+
|
| 219 |
+
return parsed_results
|
| 220 |
+
|
| 221 |
+
except Exception as e:
|
| 222 |
+
print(f"β Batch evaluation failed: {e}")
|
| 223 |
+
|
| 224 |
+
# Create error results for all outputs
|
| 225 |
+
error_results = []
|
| 226 |
+
for output in medical_outputs:
|
| 227 |
+
error_result = {
|
| 228 |
+
"query": output.get('query', ''),
|
| 229 |
+
"category": output.get('category', 'unknown'),
|
| 230 |
+
"model_type": output.get('model_type', 'unknown'),
|
| 231 |
+
"actionability_score": 0.0,
|
| 232 |
+
"evidence_score": 0.0,
|
| 233 |
+
"evaluation_success": False,
|
| 234 |
+
"error": str(e),
|
| 235 |
+
"timestamp": datetime.now().isoformat()
|
| 236 |
+
}
|
| 237 |
+
error_results.append(error_result)
|
| 238 |
+
|
| 239 |
+
self.evaluation_results.extend(error_results)
|
| 240 |
+
return error_results
|
| 241 |
+
|
| 242 |
+
def calculate_judge_statistics(self) -> Dict[str, Any]:
|
| 243 |
+
"""Calculate statistics for LLM judge evaluation"""
|
| 244 |
+
successful_results = [r for r in self.evaluation_results if r.get('evaluation_success')]
|
| 245 |
+
|
| 246 |
+
if not successful_results:
|
| 247 |
+
return {
|
| 248 |
+
"category_results": {},
|
| 249 |
+
"overall_results": {
|
| 250 |
+
"average_actionability": 0.0,
|
| 251 |
+
"average_evidence": 0.0,
|
| 252 |
+
"successful_evaluations": 0,
|
| 253 |
+
"total_queries": len(self.evaluation_results)
|
| 254 |
+
},
|
| 255 |
+
"timestamp": datetime.now().isoformat()
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
# Group by category
|
| 259 |
+
results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}
|
| 260 |
+
|
| 261 |
+
for result in successful_results:
|
| 262 |
+
category = result.get('category', 'unknown')
|
| 263 |
+
if category in results_by_category:
|
| 264 |
+
results_by_category[category].append(result)
|
| 265 |
+
|
| 266 |
+
# Calculate category statistics
|
| 267 |
+
category_stats = {}
|
| 268 |
+
for category, results in results_by_category.items():
|
| 269 |
+
if results:
|
| 270 |
+
                actionability_scores = [r['actionability_score'] for r in results]
                evidence_scores = [r['evidence_score'] for r in results]

                category_stats[category] = {
                    "average_actionability": sum(actionability_scores) / len(actionability_scores),
                    "average_evidence": sum(evidence_scores) / len(evidence_scores),
                    "query_count": len(results),
                    "actionability_target_met": (sum(actionability_scores) / len(actionability_scores)) >= 0.7,
                    "evidence_target_met": (sum(evidence_scores) / len(evidence_scores)) >= 0.75,
                    "individual_actionability_scores": actionability_scores,
                    "individual_evidence_scores": evidence_scores
                }
            else:
                category_stats[category] = {
                    "average_actionability": 0.0,
                    "average_evidence": 0.0,
                    "query_count": 0,
                    "actionability_target_met": False,
                    "evidence_target_met": False,
                    "individual_actionability_scores": [],
                    "individual_evidence_scores": []
                }

        # Calculate overall statistics
        all_actionability = [r['actionability_score'] for r in successful_results]
        all_evidence = [r['evidence_score'] for r in successful_results]

        overall_stats = {
            "average_actionability": sum(all_actionability) / len(all_actionability),
            "average_evidence": sum(all_evidence) / len(all_evidence),
            "successful_evaluations": len(successful_results),
            "total_queries": len(self.evaluation_results),
            "actionability_target_met": (sum(all_actionability) / len(all_actionability)) >= 0.7,
            "evidence_target_met": (sum(all_evidence) / len(all_evidence)) >= 0.75
        }

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "timestamp": datetime.now().isoformat()
        }

    def save_judge_statistics(self, model_type: str, filename: str = None) -> str:
        """Save judge evaluation statistics"""
        stats = self.calculate_judge_statistics()

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"judge_evaluation_{model_type}_{timestamp}.json"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        filepath = results_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)

        print(f"π Judge evaluation statistics saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent LLM judge evaluation interface"""

    print("π§ OnCall.ai LLM Judge Evaluator - Metrics 5-6 Batch Evaluation")

    if len(sys.argv) > 1 and sys.argv[1] in ['rag', 'direct']:
        model_type = sys.argv[1]
    else:
        print("Usage: python llm_judge_evaluator.py [rag|direct]")
        print("  rag    - Evaluate RAG system medical outputs")
        print("  direct - Evaluate direct LLM medical outputs")
        sys.exit(1)

    # Initialize evaluator
    evaluator = LLMJudgeEvaluator()

    try:
        # Find and load latest medical outputs
        outputs_file = evaluator.find_latest_medical_outputs(model_type)
        medical_outputs = evaluator.load_medical_outputs(outputs_file)

        if not medical_outputs:
            print(f"❌ No medical outputs found in {outputs_file}")
            sys.exit(1)

        # Batch evaluate all outputs
        print(f"\nπ§ͺ Batch LLM Judge Evaluation for {model_type.upper()} model")
        print(f"π Evaluating {len(medical_outputs)} medical advice outputs")
        print(f"π― Metrics: 5 (Actionability) + 6 (Evidence Quality)")
        print(f"β‘ Strategy: Single batch call for maximum efficiency")

        evaluation_results = evaluator.evaluate_batch_medical_outputs(medical_outputs)

        # Save results
        print(f"\nπ Generating judge evaluation analysis...")
        stats_path = evaluator.save_judge_statistics(model_type)

        # Print summary
        stats = evaluator.calculate_judge_statistics()
        overall_results = stats['overall_results']
        category_results = stats['category_results']

        print(f"\nπ === LLM JUDGE EVALUATION SUMMARY ({model_type.upper()}) ===")
        print(f"Overall Performance:")
        print(f"  Average Actionability: {overall_results['average_actionability']:.3f} ({overall_results['average_actionability']*10:.1f}/10)")
        print(f"  Average Evidence Quality: {overall_results['average_evidence']:.3f} ({overall_results['average_evidence']*10:.1f}/10)")
        print(f"  Actionability Target (≥7.0): {'✅ Met' if overall_results['actionability_target_met'] else '❌ Not Met'}")
        print(f"  Evidence Target (≥7.5): {'✅ Met' if overall_results['evidence_target_met'] else '❌ Not Met'}")

        print(f"\nCategory Breakdown:")
        for category, cat_stats in category_results.items():
            if cat_stats['query_count'] > 0:
                print(f"  {category.capitalize()}: "
                      f"Actionability={cat_stats['average_actionability']:.2f}, "
                      f"Evidence={cat_stats['average_evidence']:.2f} "
                      f"[{cat_stats['query_count']} queries]")

        print(f"\n✅ LLM judge evaluation complete!")
        print(f"π Statistics: {stats_path}")
        print(f"β‘ Efficiency: {len(medical_outputs)} evaluations in 1 LLM call")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print(f"π‘ Please run evaluator first:")
        if model_type == "rag":
            print("   python latency_evaluator.py pre_user_query_evaluate.txt")
        else:
            print("   python direct_llm_evaluator.py pre_user_query_evaluate.txt")
    except Exception as e:
        print(f"❌ Judge evaluation failed: {e}")
evaluation/relevance_chart_generator.py
ADDED
@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Relevance Chart Generator
============================================

Generates retrieval relevance charts from saved statistics.
Shows cosine similarity analysis and threshold compliance.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class RelevanceChartGenerator:
    """Generate charts for retrieval relevance metrics"""

    def __init__(self):
        """Initialize chart generator"""
        print("π Initializing Relevance Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_relevance_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent relevance statistics file"""
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        pattern = str(results_dir / "relevance_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No relevance statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"π Loading relevance statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)

        return stats

    def generate_relevance_charts(self, stats: Dict[str, Any]) -> str:
        """Generate relevance analysis charts"""
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Retrieval Relevance Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Average Relevance by Category
            ax1 = axes[0, 0]
            categories = []
            avg_relevances = []

            for category, cat_stats in category_results.items():
                if cat_stats['successful_retrievals'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_relevances.append(cat_stats['average_relevance'])

            categories.append('Overall')
            avg_relevances.append(overall_results['average_relevance'])

            bars = ax1.bar(categories, avg_relevances, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Average Relevance Score by Category', fontweight='bold')
            ax1.set_ylabel('Relevance Score (Cosine Similarity)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add threshold lines
            ax1.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
            ax1.axhline(y=0.25, color='red', linestyle='--', alpha=0.7, label='0.25 Target')
            ax1.legend()

            # Add value labels
            for bar, relevance in zip(bars, avg_relevances):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{relevance:.3f}', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Relevance Distribution
            ax2 = axes[0, 1]

            # Collect all individual relevance scores
            all_scores = []
            category_labels = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_relevance_scores'):
                    all_scores.extend(cat_stats['individual_relevance_scores'])
                    category_labels.extend([category] * len(cat_stats['individual_relevance_scores']))

            if all_scores:
                # Create histogram
                ax2.hist(all_scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
                ax2.axvline(x=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
                ax2.axvline(x=0.25, color='red', linestyle='--', alpha=0.7, label='0.25 Target')
                ax2.axvline(x=np.mean(all_scores), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(all_scores):.3f}')

                ax2.set_title('Relevance Score Distribution', fontweight='bold')
                ax2.set_xlabel('Relevance Score')
                ax2.set_ylabel('Frequency')
                ax2.legend()
                ax2.grid(True, alpha=0.3)
            else:
                ax2.text(0.5, 0.5, 'No relevance data available', ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Relevance Score Distribution', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Avg Relevance', 'Min/Max', 'Success/Total', 'Threshold Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_queries'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['average_relevance']:.3f}",
                        f"{cat_stats['min_relevance']:.3f}/{cat_stats['max_relevance']:.3f}",
                        f"{cat_stats['successful_retrievals']}/{cat_stats['total_queries']}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['average_relevance']:.3f}",
                f"{overall_results['min_relevance']:.3f}/{overall_results['max_relevance']:.3f}",
                f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
                '✅' if overall_results.get('target_compliance', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Relevance Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Category Comparison Box Plot
            ax4 = axes[1, 1]

            box_data = []
            box_labels = []

            for category, cat_stats in category_results.items():
                if cat_stats.get('individual_relevance_scores'):
                    box_data.append(cat_stats['individual_relevance_scores'])
                    box_labels.append(category.replace('_', ' ').title())

            if box_data:
                box_plot = ax4.boxplot(box_data, labels=box_labels, patch_artist=True)
                colors = ['#1f77b4', '#ff7f0e', '#d62728']
                for patch, color in zip(box_plot['boxes'], colors[:len(box_plot['boxes'])]):
                    patch.set_facecolor(color)
                    patch.set_alpha(0.7)

                ax4.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
                ax4.axhline(y=0.25, color='red', linestyle='--', alpha=0.7, label='0.25 Target')
                ax4.set_title('Relevance Distribution by Category', fontweight='bold')
                ax4.set_ylabel('Relevance Score')
                ax4.legend()
                ax4.grid(True, alpha=0.3)
            else:
                ax4.text(0.5, 0.5, 'Insufficient data for box plot', ha='center', va='center', transform=ax4.transAxes)
                ax4.set_title('Relevance Distribution by Category', fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"relevance_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
            plt.close()

            print(f"π Relevance charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Relevance chart generation failed: {e}")
            return ""


if __name__ == "__main__":
    """Independent relevance chart generation"""

    print("π OnCall.ai Relevance Chart Generator")

    chart_gen = RelevanceChartGenerator()

    try:
        stats = chart_gen.load_latest_relevance_statistics()
        chart_path = chart_gen.generate_relevance_charts(stats)

        print(f"\n✅ Relevance chart generation complete!")
        print(f"π Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("π‘ Please run latency_evaluator.py first to generate relevance statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
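For reference, a minimal sketch of the statistics structure that generate_relevance_charts() consumes, inferred from the field accesses in the code above. It is not part of the committed file; the category label and all numbers are illustrative placeholders, and it assumes relevance_chart_generator.py is importable from the evaluation directory.

# Sketch (not in the commit): example input for RelevanceChartGenerator.generate_relevance_charts().
from relevance_chart_generator import RelevanceChartGenerator

example_stats = {
    "category_results": {
        "diagnosis": {                               # hypothetical category label
            "average_relevance": 0.31,
            "min_relevance": 0.22,
            "max_relevance": 0.41,
            "successful_retrievals": 3,
            "total_queries": 3,
            "individual_relevance_scores": [0.22, 0.30, 0.41],
            "meets_threshold": True,
        }
    },
    "overall_results": {
        "average_relevance": 0.31,
        "min_relevance": 0.22,
        "max_relevance": 0.41,
        "successful_queries": 3,
        "total_queries": 3,
        "target_compliance": True,
    },
}

chart_gen = RelevanceChartGenerator()
chart_gen.generate_relevance_charts(example_stats)   # writes relevance_analysis_charts_<timestamp>.png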