YanBoChen committed · a2aaea2
1 Parent(s): 8e02192
Before running the 1st evaluation: Add Precision & MRR chart generator and a sample test query

- Implemented a new Python script `precision_mrr_chart_generator.py` for generating comprehensive charts for Precision@K and Mean Reciprocal Rank (MRR) analysis from JSON results.
- The script includes functionality for creating visualizations such as comparison charts, heatmaps, and detailed statistics tables.
- Added a sample test query in `single_test_query.txt` for evaluation purposes.
- evaluation/latency_evaluator.py +74 -16
- evaluation/{latency_chart_generator.py → metric1_latency_chart_generator.py} +0 -0
- evaluation/{extraction_chart_generator.py → metric2_extraction_chart_generator.py} +0 -0
- evaluation/{relevance_chart_generator.py → metric3_relevance_chart_generator.py} +0 -0
- evaluation/{coverage_chart_generator.py → metric4_coverage_chart_generator.py} +0 -0
- evaluation/{llm_judge_evaluator.py → metric5_6_llm_judge_evaluator.py} +0 -0
- evaluation/metric7_8_precision_MRR.py +391 -0
- evaluation/{evaluation_instruction.md → old/evaluation_instruction.md} +0 -0
- evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md} +0 -0
- evaluation/precision_mrr_chart_generator.py +586 -0
- evaluation/single_test_query.txt +1 -0
evaluation/latency_evaluator.py CHANGED

@@ -1,21 +1,48 @@
 #!/usr/bin/env python3
 """
-OnCall.ai System - Comprehensive Evaluator (Metrics 1-4)
+OnCall.ai System - Comprehensive Evaluator (Metrics 1-8)
 ========================================================
 
-Single execution to collect all metrics 1-4
-
-1. Total Latency - Complete pipeline timing
-2. Condition Extraction Success Rate - user_prompt.py success
-3. Retrieval Relevance - cosine similarity from retrieval.py
-4. Retrieval Coverage - advice utilization of retrieved content
-
-
-
+Single execution to collect all metrics 1-4 data from app.py pipeline.
+Generates foundation data for metrics 5-8 evaluation in downstream processors.
+
+COMPLETE METRICS OVERVIEW:
+
+PIPELINE PERFORMANCE METRICS (Collected by this evaluator):
+1. Total Latency - Complete pipeline processing time from query to response
+2. Condition Extraction Success Rate - Success rate of user_prompt.py condition extraction
+3. Retrieval Relevance - Average cosine similarity scores from retrieval.py results
+4. Retrieval Coverage - Medical keyword utilization rate between retrieved content and generated advice
+
+LLM JUDGE METRICS (Processed by metric5_6_llm_judge_evaluator.py):
+5. Clinical Actionability - Third-party LLM evaluation of medical advice actionability (1-10 scale)
+   * Uses batch evaluation strategy with Llama3-70B as judge
+   * Measures: Can healthcare providers immediately act on this advice?
+   * Target threshold: ≥7.0/10 for acceptable actionability
+
+6. Clinical Evidence Quality - Third-party LLM evaluation of evidence-based quality (1-10 scale)
+   * Uses same batch evaluation call as metric 5 for efficiency
+   * Measures: Is the advice evidence-based and follows medical standards?
+   * Target threshold: ≥7.5/10 for acceptable evidence quality
+
+RETRIEVAL PRECISION METRICS (Processed by metric7_8_precision_MRR.py):
+7. Precision@K - Proportion of relevant results in top-K retrieval results
+   * Uses adaptive threshold based on query complexity (0.15 for complex, 0.25 for simple queries)
+   * Query complexity determined by unique emergency keywords count (≥4 = complex)
+   * Measures: relevant_results / total_retrieved_results
+
+8. Mean Reciprocal Rank (MRR) - Average reciprocal rank of first relevant result
+   * Uses same adaptive threshold as Precision@K
+   * Measures: 1 / rank_of_first_relevant_result (0 if no relevant results)
+   * Higher MRR indicates relevant results appear earlier in ranking
+
+DATA FLOW ARCHITECTURE:
+1. latency_evaluator.py → comprehensive_details_*.json (metrics 1-4 + pipeline data)
+2. latency_evaluator.py → medical_outputs_*.json (medical advice for judge evaluation)
+3. metric5_6_llm_judge_evaluator.py → judge_evaluation_*.json (metrics 5-6)
+4. metric7_8_precision_MRR.py → precision_mrr_analysis_*.json (metrics 7-8)
+
+Note: This evaluator focuses on metrics 1-4 collection. Metrics 5-8 require separate downstream evaluation.
 
 Author: YanBo Chen
 Date: 2025-08-04

@@ -320,6 +347,31 @@ class ComprehensiveEvaluator:
             "timestamp": datetime.now().isoformat()
         }
         
+        # Validate data completeness for metrics 7-8 analysis
+        ready = True
+        data = comprehensive_result.get('pipeline_data', {})
+        
+        # 1. Check retrieval results completeness for precision/MRR calculation
+        retr = data.get('retrieval_results', {}).get('processed_results', [])
+        if not retr or 'distance' not in retr[0]:
+            ready = False
+        
+        # 2. Check condition extraction completeness for complexity analysis
+        cond = data.get('condition_result', {}).get('condition')
+        if not cond:
+            ready = False
+        
+        # 3. Check overall execution status
+        if not comprehensive_result.get('overall_success', False):
+            ready = False
+        
+        # 4. Check retrieval timing data completeness
+        if 'retrieval_time' not in comprehensive_result.get('relevance_metrics', {}):
+            ready = False
+        
+        # Set metrics 7-8 readiness flag for downstream precision/MRR analysis
+        comprehensive_result['precision_mrr_ready'] = ready
+        
         # Store result
         self.comprehensive_results.append(comprehensive_result)

@@ -386,8 +438,9 @@ class ComprehensiveEvaluator:
             },
             
             # Note: Metrics 5-6 (Clinical Actionability & Evidence Quality)
-            # are
-            #
+            # are collected by metric5_6_llm_judge_evaluator.py using medical_outputs
+            # Metrics 7-8 (Precision@K & MRR) are collected by metric7_8_precision_MRR.py
+            # using comprehensive_details pipeline data
             
             "overall_success": False,
             "status": status,

@@ -395,6 +448,9 @@ class ComprehensiveEvaluator:
             "timestamp": datetime.now().isoformat()
         }
         
+        # For failed results, precision/MRR analysis data is not ready
+        failed_result['precision_mrr_ready'] = False
+        
         self.comprehensive_results.append(failed_result)
         return failed_result

@@ -741,8 +797,8 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         query_file = sys.argv[1]
     else:
-        # Default to evaluation/
-        query_file = Path(__file__).parent / "
+        # Default to evaluation/single_test_query.txt for initial testing
+        query_file = Path(__file__).parent / "single_test_query.txt"
     
     if not os.path.exists(query_file):
         print(f"❌ Query file not found: {query_file}")

@@ -829,7 +885,9 @@ if __name__ == "__main__":
     print(f"   📊 {metric_name.capitalize()}: {filepath}")
     print(f"   📄 Medical Outputs: {outputs_path}")
     print(f"   📋 Comprehensive Details: {details_path}")
-    print(f"\n💡 Next step: Run
+    print(f"\n💡 Next step: Run downstream evaluators for metrics 5-8")
+    print(f"   python metric5_6_llm_judge_evaluator.py rag")
+    print(f"   python metric7_8_precision_MRR.py {details_path}")
     print(f"   python latency_chart_generator.py")
     print(f"   python extraction_chart_generator.py  # (create separately)")
     print(f"   python relevance_chart_generator.py  # (create separately)")
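Editor's note: the metrics 7-8 math described in the docstring above can be checked by hand. A minimal standalone sketch (not part of the commit; the distance values are made up for illustration):

#!/usr/bin/env python3
# Worked example of the Precision@K / MRR logic described above.
# Relevance = 1 - cosine distance; a result counts as relevant when
# relevance >= threshold (0.15 for complex queries, 0.25 for simple ones).

distances = [0.70, 0.80, 0.72, 0.95]        # hypothetical retrieval distances
relevance = [1.0 - d for d in distances]     # -> [0.30, 0.20, 0.28, 0.05]

for label, threshold in [("simple", 0.25), ("complex", 0.15)]:
    relevant = [r >= threshold for r in relevance]
    precision_at_k = sum(relevant) / len(relevant)
    # MRR: reciprocal of the 1-based rank of the first relevant hit, else 0
    mrr = next((1.0 / (i + 1) for i, hit in enumerate(relevant) if hit), 0.0)
    print(f"{label}: threshold={threshold} precision@{len(relevant)}={precision_at_k:.2f} mrr={mrr:.2f}")

# simple:  threshold=0.25 -> precision@4=0.50 mrr=1.00
# complex: threshold=0.15 -> precision@4=0.75 mrr=1.00

The lenient threshold for complex queries admits more of the same results as relevant, which is exactly the adaptive-threshold behavior the evaluator documents.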
evaluation/{latency_chart_generator.py → metric1_latency_chart_generator.py} RENAMED
File without changes

evaluation/{extraction_chart_generator.py → metric2_extraction_chart_generator.py} RENAMED
File without changes

evaluation/{relevance_chart_generator.py → metric3_relevance_chart_generator.py} RENAMED
File without changes

evaluation/{coverage_chart_generator.py → metric4_coverage_chart_generator.py} RENAMED
File without changes

evaluation/{llm_judge_evaluator.py → metric5_6_llm_judge_evaluator.py} RENAMED
File without changes
evaluation/metric7_8_precision_MRR.py ADDED

@@ -0,0 +1,391 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Precision & MRR Analyzer (Metrics 7-8)
========================================================

Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
using data collected from latency_evaluator.py comprehensive evaluation.

METRICS CALCULATED:
7. Precision@K - Proportion of relevant results in top-K retrieval
8. Mean Reciprocal Rank (MRR) - Average reciprocal rank of first relevant result

DESIGN PRINCIPLE:
- Reuses comprehensive_details_*.json from latency_evaluator.py
- Implements adaptive threshold based on query complexity
- Query complexity determined by actual matched emergency keywords count
- No additional LLM calls required

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any, Set
from datetime import datetime
from pathlib import Path
import re
import statistics


class PrecisionMRRAnalyzer:
    """Specialized analyzer for metrics 7-8 using existing comprehensive evaluation data"""

    def __init__(self):
        """Initialize analyzer"""
        print("🔧 Initializing Precision & MRR Analyzer...")
        self.analysis_results = []
        print("✅ Analyzer initialization complete")

    def load_comprehensive_data(self, filepath: str) -> List[Dict]:
        """
        Load comprehensive evaluation data from latency_evaluator.py output

        Args:
            filepath: Path to comprehensive_details_*.json file

        Returns:
            List of comprehensive evaluation results
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            comprehensive_results = data.get('comprehensive_results', [])

            print(f"📊 Loaded {len(comprehensive_results)} comprehensive evaluation results")
            print(f"📋 Ready for precision/MRR analysis: {sum(1 for r in comprehensive_results if r.get('precision_mrr_ready'))}")

            return comprehensive_results

        except Exception as e:
            print(f"❌ Failed to load comprehensive data: {e}")
            return []

    def _is_complex_query(self, query: str, processed_results: List[Dict]) -> bool:
        """
        Determine query complexity based on actual matched emergency keywords

        Args:
            query: Original query text
            processed_results: Retrieval results with matched keywords

        Returns:
            True if query is complex (should use lenient threshold)
        """
        # Collect unique emergency keywords actually found in retrieval results
        unique_emergency_keywords = set()

        for result in processed_results:
            if result.get('type') == 'emergency':
                matched_keywords = result.get('matched', '')
                if matched_keywords:
                    keywords = [kw.strip() for kw in matched_keywords.split('|') if kw.strip()]
                    unique_emergency_keywords.update(keywords)

        keyword_count = len(unique_emergency_keywords)

        # Business logic: 4+ different emergency keywords indicate complex case
        is_complex = keyword_count >= 4

        print(f"   🧠 Query complexity: {'Complex' if is_complex else 'Simple'} ({keyword_count} emergency keywords)")
        print(f"   🔍 Found keywords: {', '.join(list(unique_emergency_keywords)[:5])}")

        return is_complex

    def calculate_precision_mrr_single(self, query_data: Dict) -> Dict[str, Any]:
        """
        Calculate precision@K and MRR for single query

        Args:
            query_data: Single query's comprehensive evaluation result

        Returns:
            Precision and MRR metrics for this query
        """
        query = query_data['query']
        category = query_data['category']

        # Extract processed results from pipeline data
        pipeline_data = query_data.get('pipeline_data', {})
        retrieval_results = pipeline_data.get('retrieval_results', {})
        processed_results = retrieval_results.get('processed_results', [])

        print(f"🔍 Analyzing precision/MRR for: {query[:50]}...")
        print(f"📊 Category: {category}, Results: {len(processed_results)}")

        if not processed_results:
            return self._create_empty_precision_mrr_result(query, category)

        # Step 1: Determine query complexity
        is_complex = self._is_complex_query(query, processed_results)

        # Step 2: Choose adaptive threshold
        threshold = 0.15 if is_complex else 0.25

        print(f"   🎯 Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")

        # Step 3: Calculate relevance scores (1 - distance)
        relevance_scores = []
        for result in processed_results:
            distance = result.get('distance', 1.0)
            relevance = 1.0 - distance
            relevance_scores.append(relevance)

        # Step 4: Calculate Precision@K
        relevant_count = sum(1 for score in relevance_scores if score >= threshold)
        precision_at_k = relevant_count / len(processed_results)

        # Step 5: Calculate MRR
        first_relevant_rank = None
        for i, score in enumerate(relevance_scores, 1):
            if score >= threshold:
                first_relevant_rank = i
                break

        mrr_score = (1.0 / first_relevant_rank) if first_relevant_rank else 0.0

        # Detailed analysis
        result = {
            "query": query,
            "category": category,
            "query_complexity": "complex" if is_complex else "simple",
            "threshold_used": threshold,

            # Metric 7: Precision@K
            "precision_at_k": precision_at_k,
            "relevant_count": relevant_count,
            "total_results": len(processed_results),

            # Metric 8: MRR
            "mrr_score": mrr_score,
            "first_relevant_rank": first_relevant_rank,

            # Supporting data
            "relevance_scores": relevance_scores,
            "avg_relevance": sum(relevance_scores) / len(relevance_scores),
            "max_relevance": max(relevance_scores),
            "min_relevance": min(relevance_scores),

            "timestamp": datetime.now().isoformat()
        }

        print(f"   📊 Precision@{len(processed_results)}: {precision_at_k:.3f} ({relevant_count}/{len(processed_results)} relevant)")
        print(f"   📊 MRR: {mrr_score:.3f} (first relevant at rank {first_relevant_rank})")

        return result

    def _create_empty_precision_mrr_result(self, query: str, category: str) -> Dict[str, Any]:
        """Create empty result for failed queries"""
        return {
            "query": query,
            "category": category,
            "query_complexity": "unknown",
            "threshold_used": 0.0,
            "precision_at_k": 0.0,
            "relevant_count": 0,
            "total_results": 0,
            "mrr_score": 0.0,
            "first_relevant_rank": None,
            "relevance_scores": [],
            "timestamp": datetime.now().isoformat()
        }

    def analyze_all_queries(self, comprehensive_results: List[Dict]) -> List[Dict]:
        """
        Analyze precision/MRR for all queries in comprehensive evaluation

        Args:
            comprehensive_results: Results from latency_evaluator.py

        Returns:
            List of precision/MRR analysis results
        """
        print(f"\n🚀 Analyzing Precision@K and MRR for {len(comprehensive_results)} queries...")

        analysis_results = []

        for i, query_data in enumerate(comprehensive_results):
            if not query_data.get('precision_mrr_ready'):
                print(f"⚠️ Skipping query {i+1}: Not ready for precision/MRR analysis")
                continue

            if not query_data.get('overall_success'):
                print(f"⚠️ Skipping query {i+1}: Pipeline failed")
                analysis_results.append(self._create_empty_precision_mrr_result(
                    query_data['query'],
                    query_data['category']
                ))
                continue

            # Analyze this query
            result = self.calculate_precision_mrr_single(query_data)
            analysis_results.append(result)

            print("")  # Spacing between queries

        self.analysis_results = analysis_results
        return analysis_results

    def calculate_statistics(self) -> Dict[str, Any]:
        """Calculate comprehensive statistics for metrics 7-8"""

        if not self.analysis_results:
            return {"error": "No analysis results available"}

        # Separate by complexity and category
        stats = {
            "overall_statistics": {},
            "by_complexity": {"simple": {}, "complex": {}},
            "by_category": {"diagnosis": {}, "treatment": {}, "mixed": {}},
            "timestamp": datetime.now().isoformat()
        }

        # Overall statistics
        all_precision = [r['precision_at_k'] for r in self.analysis_results]
        all_mrr = [r['mrr_score'] for r in self.analysis_results]

        stats["overall_statistics"] = {
            "total_queries": len(self.analysis_results),
            "avg_precision": statistics.mean(all_precision),
            "avg_mrr": statistics.mean(all_mrr),
            "precision_std": statistics.stdev(all_precision) if len(all_precision) > 1 else 0.0,
            "mrr_std": statistics.stdev(all_mrr) if len(all_mrr) > 1 else 0.0
        }

        # By complexity
        for complexity in ["simple", "complex"]:
            complexity_results = [r for r in self.analysis_results if r['query_complexity'] == complexity]
            if complexity_results:
                precision_scores = [r['precision_at_k'] for r in complexity_results]
                mrr_scores = [r['mrr_score'] for r in complexity_results]

                stats["by_complexity"][complexity] = {
                    "query_count": len(complexity_results),
                    "avg_precision": statistics.mean(precision_scores),
                    "avg_mrr": statistics.mean(mrr_scores),
                    "avg_threshold": statistics.mean([r['threshold_used'] for r in complexity_results])
                }

        # By category
        for category in ["diagnosis", "treatment", "mixed"]:
            category_results = [r for r in self.analysis_results if r['category'] == category]
            if category_results:
                precision_scores = [r['precision_at_k'] for r in category_results]
                mrr_scores = [r['mrr_score'] for r in category_results]

                stats["by_category"][category] = {
                    "query_count": len(category_results),
                    "avg_precision": statistics.mean(precision_scores),
                    "avg_mrr": statistics.mean(mrr_scores)
                }

        return stats

    def save_results(self, filename: str = None) -> str:
        """Save precision/MRR analysis results"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"precision_mrr_analysis_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)

        filepath = results_dir / filename

        # Create output data
        output_data = {
            "analysis_metadata": {
                "total_queries": len(self.analysis_results),
                "analysis_type": "precision_mrr_metrics_7_8",
                "timestamp": datetime.now().isoformat(),
                "adaptive_threshold": True
            },
            "detailed_results": self.analysis_results,
            "statistics": self.calculate_statistics()
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"📊 Precision/MRR analysis saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent precision/MRR analysis interface"""

    print("🚀 OnCall.ai Precision & MRR Analyzer - Metrics 7-8")

    if len(sys.argv) > 1:
        comprehensive_file = sys.argv[1]
    else:
        # Look for latest comprehensive_details file
        results_dir = Path(__file__).parent / "results"
        if results_dir.exists():
            comprehensive_files = list(results_dir.glob("comprehensive_details_*.json"))
            if comprehensive_files:
                comprehensive_file = str(sorted(comprehensive_files)[-1])  # Latest file
                print(f"📁 Using latest comprehensive file: {comprehensive_file}")
            else:
                print("❌ No comprehensive_details_*.json files found")
                print("Please run latency_evaluator.py first to generate comprehensive data")
                sys.exit(1)
        else:
            print("❌ Results directory not found")
            sys.exit(1)

    if not os.path.exists(comprehensive_file):
        print(f"❌ Comprehensive file not found: {comprehensive_file}")
        print("Usage: python precision_MRR.py [comprehensive_details_file.json]")
        sys.exit(1)

    # Initialize analyzer
    analyzer = PrecisionMRRAnalyzer()

    # Load comprehensive data from latency_evaluator.py
    comprehensive_results = analyzer.load_comprehensive_data(comprehensive_file)

    if not comprehensive_results:
        print("❌ No comprehensive data loaded")
        sys.exit(1)

    # Analyze precision/MRR for all queries
    analysis_results = analyzer.analyze_all_queries(comprehensive_results)

    # Calculate and display statistics
    statistics_result = analyzer.calculate_statistics()

    print(f"\n📊 === PRECISION & MRR ANALYSIS SUMMARY ===")

    overall_stats = statistics_result['overall_statistics']
    print(f"\nOVERALL METRICS:")
    print(f"   Precision@K: {overall_stats['avg_precision']:.3f} (±{overall_stats['precision_std']:.3f})")
    print(f"   MRR: {overall_stats['avg_mrr']:.3f} (±{overall_stats['mrr_std']:.3f})")
    print(f"   Total Queries: {overall_stats['total_queries']}")

    # Complexity-based statistics
    complexity_stats = statistics_result['by_complexity']
    print(f"\nBY COMPLEXITY:")
    for complexity, stats in complexity_stats.items():
        if stats:
            print(f"   {complexity.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
                  f"(threshold={stats['avg_threshold']:.2f}, n={stats['query_count']})")

    # Category-based statistics
    category_stats = statistics_result['by_category']
    print(f"\nBY CATEGORY:")
    for category, stats in category_stats.items():
        if stats:
            print(f"   {category.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
                  f"(n={stats['query_count']})")

    # Save results
    saved_path = analyzer.save_results()

    print(f"\n✅ Precision & MRR analysis complete!")
    print(f"📁 Results saved to: {saved_path}")
    print(f"\n💡 Next step: Create precision_mrr_chart_generator.py for visualization")
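Editor's note: the analyzer can also be driven from another script instead of the CLI path above. A short usage sketch (the input filename is hypothetical; methods match the file above):

from metric7_8_precision_MRR import PrecisionMRRAnalyzer

analyzer = PrecisionMRRAnalyzer()
# hypothetical file from an earlier latency_evaluator.py run
results = analyzer.load_comprehensive_data("results/comprehensive_details_20250804_120000.json")
if results:
    analyzer.analyze_all_queries(results)      # per-query Precision@K and MRR
    print(analyzer.calculate_statistics())     # overall / by-complexity / by-category stats
    analyzer.save_results()                    # writes precision_mrr_analysis_*.json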
evaluation/{evaluation_instruction.md → old/evaluation_instruction.md} RENAMED
File without changes

evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md} RENAMED
File without changes
evaluation/precision_mrr_chart_generator.py ADDED

@@ -0,0 +1,586 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Precision & MRR Chart Generator (Metrics 7-8)
===============================================================

Generates comprehensive Precision@K and MRR analysis charts from saved analysis results.
Reads JSON files produced by metric7_8_precision_MRR.py and creates visualizations.

Charts generated:
1. Precision@K comparison by category and complexity
2. MRR comparison by category and complexity
3. Combined metrics heatmap
4. Threshold impact analysis
5. Detailed statistics tables

No LLM calls - pure data visualization.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class PrecisionMRRChartGenerator:
    """Generate charts from precision/MRR analysis results - no LLM dependency"""

    def __init__(self):
        """Initialize chart generator"""
        print("📊 Initializing Precision & MRR Chart Generator...")

        # Set up professional chart style
        plt.style.use('default')
        sns.set_palette("husl")

        print("✅ Chart Generator ready")

    def load_latest_analysis(self, results_dir: str = None) -> Dict[str, Any]:
        """
        Load the most recent precision/MRR analysis file

        Args:
            results_dir: Directory containing analysis files
        """
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        analysis_files = glob.glob(str(results_dir / "precision_mrr_analysis_*.json"))

        if not analysis_files:
            raise FileNotFoundError("No precision_mrr_analysis_*.json files found. Run metric7_8_precision_MRR.py first.")

        latest_file = max(analysis_files, key=os.path.getctime)
        print(f"📁 Loading latest analysis: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def create_precision_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create Precision@K comparison chart"""

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Chart 1: Precision by Category
        category_stats = analysis_data['statistics']['by_category']
        categories = []
        precisions = []

        for category, stats in category_stats.items():
            if stats:
                categories.append(category.title())
                precisions.append(stats['avg_precision'])

        if categories:
            bars1 = ax1.bar(categories, precisions, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728'])
            ax1.set_title('Precision@K by Query Category', fontweight='bold')
            ax1.set_ylabel('Precision@K')
            ax1.set_xlabel('Query Category')
            ax1.set_ylim(0, 1.0)
            ax1.grid(True, alpha=0.3)

            # Add value labels
            for bar, precision in zip(bars1, precisions):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{precision:.3f}', ha='center', va='bottom', fontweight='bold')

        # Chart 2: Precision by Complexity
        complexity_stats = analysis_data['statistics']['by_complexity']
        complexities = []
        comp_precisions = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                comp_precisions.append(stats['avg_precision'])

        if complexities:
            bars2 = ax2.bar(complexities, comp_precisions, alpha=0.8, color=['#2ca02c', '#d62728'])
            ax2.set_title('Precision@K by Query Complexity', fontweight='bold')
            ax2.set_ylabel('Precision@K')
            ax2.set_xlabel('Query Complexity')
            ax2.set_ylim(0, 1.0)
            ax2.grid(True, alpha=0.3)

            # Add value labels and threshold info
            for bar, precision, complexity in zip(bars2, comp_precisions, complexities):
                height = bar.get_height()
                threshold = 0.15 if complexity.lower() == 'complex' else 0.25
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{precision:.3f}\n(T={threshold})', ha='center', va='bottom',
                         fontweight='bold', fontsize=9)

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Precision comparison chart saved: {save_path}")
        return str(save_path)

    def create_mrr_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create MRR comparison chart"""

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Chart 1: MRR by Category
        category_stats = analysis_data['statistics']['by_category']
        categories = []
        mrr_scores = []

        for category, stats in category_stats.items():
            if stats:
                categories.append(category.title())
                mrr_scores.append(stats['avg_mrr'])

        if categories:
            bars1 = ax1.bar(categories, mrr_scores, alpha=0.8, color=['#9467bd', '#8c564b', '#e377c2'])
            ax1.set_title('Mean Reciprocal Rank by Query Category', fontweight='bold')
            ax1.set_ylabel('MRR Score')
            ax1.set_xlabel('Query Category')
            ax1.set_ylim(0, 1.0)
            ax1.grid(True, alpha=0.3)

            # Add value labels
            for bar, mrr in zip(bars1, mrr_scores):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{mrr:.3f}', ha='center', va='bottom', fontweight='bold')

        # Chart 2: MRR by Complexity
        complexity_stats = analysis_data['statistics']['by_complexity']
        complexities = []
        comp_mrr = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                comp_mrr.append(stats['avg_mrr'])

        if complexities:
            bars2 = ax2.bar(complexities, comp_mrr, alpha=0.8, color=['#17becf', '#bcbd22'])
            ax2.set_title('MRR by Query Complexity', fontweight='bold')
            ax2.set_ylabel('MRR Score')
            ax2.set_xlabel('Query Complexity')
            ax2.set_ylim(0, 1.0)
            ax2.grid(True, alpha=0.3)

            # Add value labels
            for bar, mrr in zip(bars2, comp_mrr):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{mrr:.3f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"mrr_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 MRR comparison chart saved: {save_path}")
        return str(save_path)

    def create_combined_metrics_heatmap(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create combined precision/MRR heatmap"""

        # Prepare data for heatmap
        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("⚠️ No detailed results for heatmap")
            return ""

        # Create DataFrame for heatmap
        heatmap_data = []
        for result in detailed_results:
            heatmap_data.append({
                'Category': result['category'].title(),
                'Complexity': result['query_complexity'].title(),
                'Precision@K': result['precision_at_k'],
                'MRR': result['mrr_score'],
                'Threshold': result['threshold_used']
            })

        df = pd.DataFrame(heatmap_data)

        # Create pivot table for heatmap
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # Precision heatmap
        precision_pivot = df.pivot_table(values='Precision@K', index='Category', columns='Complexity', aggfunc='mean')
        sns.heatmap(precision_pivot, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax1,
                    cbar_kws={'label': 'Precision@K'}, vmin=0, vmax=1)
        ax1.set_title('Precision@K Heatmap\n(Category vs Complexity)', fontweight='bold')

        # MRR heatmap
        mrr_pivot = df.pivot_table(values='MRR', index='Category', columns='Complexity', aggfunc='mean')
        sns.heatmap(mrr_pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=ax2,
                    cbar_kws={'label': 'MRR Score'}, vmin=0, vmax=1)
        ax2.set_title('MRR Heatmap\n(Category vs Complexity)', fontweight='bold')

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_mrr_heatmap_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Combined metrics heatmap saved: {save_path}")
        return str(save_path)

    def create_threshold_impact_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create threshold impact analysis chart"""

        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("⚠️ No detailed results for threshold analysis")
            return ""

        # Group by complexity and calculate average relevance
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Prepare data
        simple_queries = [r for r in detailed_results if r['query_complexity'] == 'simple']
        complex_queries = [r for r in detailed_results if r['query_complexity'] == 'complex']

        # Chart 1: Relevance distribution for different complexities
        if simple_queries:
            simple_relevances = []
            for query in simple_queries:
                simple_relevances.extend(query.get('relevance_scores', []))

            ax1.hist(simple_relevances, bins=10, alpha=0.7, label=f'Simple (T=0.25)', color='#2ca02c', density=True)
            ax1.axvline(x=0.25, color='#2ca02c', linestyle='--', linewidth=2, label='Simple Threshold')

        if complex_queries:
            complex_relevances = []
            for query in complex_queries:
                complex_relevances.extend(query.get('relevance_scores', []))

            ax1.hist(complex_relevances, bins=10, alpha=0.7, label=f'Complex (T=0.15)', color='#d62728', density=True)
            ax1.axvline(x=0.15, color='#d62728', linestyle='--', linewidth=2, label='Complex Threshold')

        ax1.set_title('Relevance Score Distribution\nby Query Complexity', fontweight='bold')
        ax1.set_xlabel('Relevance Score')
        ax1.set_ylabel('Density')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Chart 2: Metrics comparison
        complexity_stats = analysis_data['statistics']['by_complexity']

        complexities = []
        precisions = []
        mrrs = []
        thresholds = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                precisions.append(stats['avg_precision'])
                mrrs.append(stats['avg_mrr'])
                thresholds.append(stats['avg_threshold'])

        x = np.arange(len(complexities))
        width = 0.35

        bars1 = ax2.bar(x - width/2, precisions, width, label='Precision@K', alpha=0.8, color='#ff7f0e')
        bars2 = ax2.bar(x + width/2, mrrs, width, label='MRR', alpha=0.8, color='#1f77b4')

        ax2.set_title('Metrics Comparison by Complexity\n(with Adaptive Thresholds)', fontweight='bold')
        ax2.set_ylabel('Score')
        ax2.set_xlabel('Query Complexity')
        ax2.set_xticks(x)
        ax2.set_xticklabels(complexities)
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        ax2.set_ylim(0, 1.0)

        # Add value labels
        for bars, values, thresholds_vals in [(bars1, precisions, thresholds), (bars2, mrrs, thresholds)]:
            for bar, value, threshold in zip(bars, values, thresholds_vals):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{value:.3f}', ha='center', va='bottom', fontweight='bold', fontsize=9)

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"threshold_impact_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Threshold impact chart saved: {save_path}")
        return str(save_path)

    def create_detailed_analysis_table(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create detailed statistics table"""

        fig, ax = plt.subplots(figsize=(12, 8))
        ax.axis('tight')
        ax.axis('off')

        # Prepare table data
        table_data = []

        # Overall statistics
        overall_stats = analysis_data['statistics']['overall_statistics']
        table_data.append(['OVERALL METRICS', '', '', '', ''])
        table_data.append(['Total Queries', str(overall_stats['total_queries']), '', '', ''])
        table_data.append(['Avg Precision@K', f"{overall_stats['avg_precision']:.3f}",
                           f"±{overall_stats['precision_std']:.3f}", '', ''])
        table_data.append(['Avg MRR', f"{overall_stats['avg_mrr']:.3f}",
                           f"±{overall_stats['mrr_std']:.3f}", '', ''])
        table_data.append(['', '', '', '', ''])

        # By category
        table_data.append(['BY CATEGORY', 'Queries', 'Precision@K', 'MRR', 'Notes'])
        category_stats = analysis_data['statistics']['by_category']
        for category, stats in category_stats.items():
            if stats:
                table_data.append([
                    category.title(),
                    str(stats['query_count']),
                    f"{stats['avg_precision']:.3f}",
                    f"{stats['avg_mrr']:.3f}",
                    ''
                ])

        table_data.append(['', '', '', '', ''])

        # By complexity
        table_data.append(['BY COMPLEXITY', 'Queries', 'Precision@K', 'MRR', 'Threshold'])
        complexity_stats = analysis_data['statistics']['by_complexity']
        for complexity, stats in complexity_stats.items():
            if stats:
                table_data.append([
                    complexity.title(),
                    str(stats['query_count']),
                    f"{stats['avg_precision']:.3f}",
                    f"{stats['avg_mrr']:.3f}",
                    f"{stats['avg_threshold']:.2f}"
                ])

        # Create table
        table = ax.table(cellText=table_data,
                         colLabels=['Metric', 'Value 1', 'Value 2', 'Value 3', 'Value 4'],
                         cellLoc='center',
                         loc='center',
                         bbox=[0, 0, 1, 1])

        # Style the table
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1, 2)

        # Header styling
        for i in range(5):
            table[(0, i)].set_facecolor('#40466e')
            table[(0, i)].set_text_props(weight='bold', color='white')

        # Section headers styling
        for i, row in enumerate(table_data):
            if row[0] in ['OVERALL METRICS', 'BY CATEGORY', 'BY COMPLEXITY']:
                table[(i+1, 0)].set_facecolor('#1f77b4')
                table[(i+1, 0)].set_text_props(weight='bold', color='white')

        plt.title('Precision@K & MRR Detailed Analysis\nMetrics 7-8 Statistics',
                  fontweight='bold', fontsize=14, pad=20)

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_mrr_table_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Detailed analysis table saved: {save_path}")
        return str(save_path)

    def create_individual_query_analysis(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create individual query analysis chart"""

        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("⚠️ No detailed results for individual analysis")
            return ""

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

        # Prepare data
        query_indices = []
        precisions = []
        mrrs = []
        colors = []
        labels = []

        for i, result in enumerate(detailed_results):
            query_indices.append(i + 1)
            precisions.append(result['precision_at_k'])
            mrrs.append(result['mrr_score'])

            # Color by complexity
            if result['query_complexity'] == 'complex':
                colors.append('#d62728')  # Red for complex
            else:
                colors.append('#2ca02c')  # Green for simple

            # Create short label
            query_short = result['query'][:30] + "..." if len(result['query']) > 30 else result['query']
            category = result['category'][:4].upper()
            labels.append(f"{category}\n{query_short}")

        # Chart 1: Precision@K for each query
        bars1 = ax1.bar(query_indices, precisions, color=colors, alpha=0.8)
        ax1.set_title('Precision@K by Individual Query', fontweight='bold')
        ax1.set_ylabel('Precision@K')
        ax1.set_xlabel('Query Index')
        ax1.set_ylim(0, 1.0)
        ax1.grid(True, alpha=0.3)

        # Add value labels
        for bar, precision in zip(bars1, precisions):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                     f'{precision:.2f}', ha='center', va='bottom', fontsize=8)

        # Chart 2: MRR for each query
        bars2 = ax2.bar(query_indices, mrrs, color=colors, alpha=0.8)
        ax2.set_title('MRR by Individual Query', fontweight='bold')
        ax2.set_ylabel('MRR Score')
        ax2.set_xlabel('Query Index')
        ax2.set_ylim(0, 1.0)
        ax2.grid(True, alpha=0.3)

        # Add value labels
        for bar, mrr in zip(bars2, mrrs):
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                     f'{mrr:.2f}', ha='center', va='bottom', fontsize=8)

        # Add legend
        from matplotlib.patches import Patch
        legend_elements = [
            Patch(facecolor='#2ca02c', alpha=0.8, label='Simple Query (T=0.25)'),
            Patch(facecolor='#d62728', alpha=0.8, label='Complex Query (T=0.15)')
        ]
        ax1.legend(handles=legend_elements, loc='upper right')

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"individual_query_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"📊 Individual query analysis saved: {save_path}")
        return str(save_path)

    def generate_all_charts(self, analysis_data: Dict = None) -> Dict[str, str]:
        """Generate all precision/MRR charts"""

        if analysis_data is None:
            analysis_data = self.load_latest_analysis()

        print(f"\n📊 Generating all Precision & MRR charts...")

        saved_charts = {}

        # Generate all chart types
        try:
            saved_charts['precision_comparison'] = self.create_precision_comparison_chart(analysis_data)
            saved_charts['mrr_comparison'] = self.create_mrr_comparison_chart(analysis_data)
            saved_charts['combined_heatmap'] = self.create_combined_metrics_heatmap(analysis_data)
            saved_charts['threshold_impact'] = self.create_threshold_impact_chart(analysis_data)
            saved_charts['individual_analysis'] = self.create_individual_query_analysis(analysis_data)

        except Exception as e:
            print(f"❌ Error generating charts: {e}")
            return {"error": str(e)}

        print(f"\n✅ All precision/MRR charts generated successfully!")
        print(f"📁 Charts saved to: evaluation/charts/")

        return saved_charts


# Independent execution interface
if __name__ == "__main__":
    """Generate precision/MRR charts from analysis results"""

    print("🚀 OnCall.ai Precision & MRR Chart Generator - Metrics 7-8")

    if len(sys.argv) > 1:
        analysis_file = sys.argv[1]

        if not os.path.exists(analysis_file):
            print(f"❌ Analysis file not found: {analysis_file}")
            sys.exit(1)
    else:
        analysis_file = None  # Will use latest file

    # Initialize generator
    generator = PrecisionMRRChartGenerator()

    try:
        # Load analysis data
        if analysis_file:
            with open(analysis_file, 'r', encoding='utf-8') as f:
                analysis_data = json.load(f)
            print(f"📁 Using specified analysis file: {analysis_file}")
        else:
            analysis_data = generator.load_latest_analysis()

        # Generate all charts
        saved_charts = generator.generate_all_charts(analysis_data)

        if 'error' not in saved_charts:
            print(f"\n📊 === PRECISION & MRR CHART GENERATION SUMMARY ===")
            for chart_type, filepath in saved_charts.items():
                print(f"   📊 {chart_type.replace('_', ' ').title()}: {filepath}")

            print(f"\n💡 Charts ready for analysis and presentation!")

    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
        sys.exit(1)
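Editor's note: as with the analyzer, the chart generator can be used as a library. A minimal sketch assuming a precision_mrr_analysis_*.json file already exists under evaluation/results/:

from precision_mrr_chart_generator import PrecisionMRRChartGenerator

generator = PrecisionMRRChartGenerator()
data = generator.load_latest_analysis()        # picks the newest analysis JSON
charts = generator.generate_all_charts(data)   # writes PNGs under evaluation/charts/
print(charts)                                  # mapping of chart type -> saved path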
evaluation/single_test_query.txt ADDED

@@ -0,0 +1 @@
+1.diagnosis: 60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?