Spaces:
Sleeping
Sleeping
YanBoChen
committed on
Commit
·
b0f56ec
1
Parent(s):
40d39ed
Refactor relevance calculation and update thresholds in latency evaluator; enhance precision and MRR analyzer with angular distance metrics; increase timeout for primary generation in fallback configuration.
Browse files- evaluation/latency_evaluator.py +19 -21
- evaluation/metric7_8_precision_MRR.py +9 -2
- src/generation.py +2 -2
evaluation/latency_evaluator.py
CHANGED
|
@@ -273,27 +273,25 @@ class ComprehensiveEvaluator:
|
|
| 273 |
|
| 274 |
# METRIC 3: Retrieval Relevance Analysis
|
| 275 |
if processed_results:
|
| 276 |
-
|
| 277 |
for doc_result in processed_results:
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
)
|
| 283 |
-
similarity_scores.append(similarity)
|
| 284 |
|
| 285 |
-
average_relevance = sum(
|
| 286 |
-
high_relevance_count = sum(1 for score in
|
| 287 |
|
| 288 |
relevance_metrics = {
|
| 289 |
"average_relevance": average_relevance,
|
| 290 |
-
"max_relevance": max(
|
| 291 |
-
"min_relevance": min(
|
| 292 |
-
"
|
| 293 |
"high_relevance_count": high_relevance_count,
|
| 294 |
-
"high_relevance_ratio": high_relevance_count / len(
|
| 295 |
"retrieved_count": len(processed_results),
|
| 296 |
-
"meets_threshold": average_relevance >= 0.
|
| 297 |
"retrieval_time": step3_time
|
| 298 |
}
|
| 299 |
else:
|
|
@@ -322,7 +320,7 @@ class ComprehensiveEvaluator:
|
|
| 322 |
"latency_metrics": {
|
| 323 |
"total_latency": total_time,
|
| 324 |
"timing_details": timing_details,
|
| 325 |
-
"meets_target": total_time <=
|
| 326 |
},
|
| 327 |
|
| 328 |
# Metric 2: Condition Extraction - Success rate from user_prompt.py
|
|
@@ -411,7 +409,7 @@ class ComprehensiveEvaluator:
|
|
| 411 |
"latency_metrics": {
|
| 412 |
"total_latency": total_time,
|
| 413 |
"timing_details": timing_details,
|
| 414 |
-
"meets_target": total_time <=
|
| 415 |
},
|
| 416 |
|
| 417 |
# Metric 2: Condition Extraction - Partial data may be available before failure
|
|
@@ -546,7 +544,7 @@ class ComprehensiveEvaluator:
|
|
| 546 |
"min_latency": min(latencies),
|
| 547 |
"max_latency": max(latencies),
|
| 548 |
"query_count": len(latencies),
|
| 549 |
-
"target_compliance": sum(1 for lat in latencies if lat <=
|
| 550 |
"individual_latencies": latencies
|
| 551 |
}
|
| 552 |
else:
|
|
@@ -661,7 +659,7 @@ class ComprehensiveEvaluator:
|
|
| 661 |
"max_latency": max(latencies),
|
| 662 |
"successful_queries": len(all_successful_results),
|
| 663 |
"total_queries": total_queries,
|
| 664 |
-
"target_compliance": sum(1 for lat in latencies if lat <=
|
| 665 |
}
|
| 666 |
|
| 667 |
elif metric_name == "extraction":
|
|
@@ -682,8 +680,8 @@ class ComprehensiveEvaluator:
|
|
| 682 |
"min_relevance": min(relevance_scores),
|
| 683 |
"successful_queries": len(all_successful_results),
|
| 684 |
"total_queries": total_queries,
|
| 685 |
-
"meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.
|
| 686 |
-
"target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.
|
| 687 |
}
|
| 688 |
|
| 689 |
elif metric_name == "coverage" and all_successful_results:
|
|
@@ -866,7 +864,7 @@ if __name__ == "__main__":
|
|
| 866 |
|
| 867 |
if metric_name == "latency":
|
| 868 |
print(f" Average: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
|
| 869 |
-
print(f"
|
| 870 |
|
| 871 |
elif metric_name == "extraction":
|
| 872 |
print(f" Success Rate: {overall_results['success_rate']:.1%}")
|
|
|
|
| 273 |
|
| 274 |
# METRIC 3: Retrieval Relevance Analysis
|
| 275 |
if processed_results:
|
| 276 |
+
relevance_scores = []
|
| 277 |
for doc_result in processed_results:
|
| 278 |
+
# Get angular distance and convert to relevance using correct formula
|
| 279 |
+
distance = doc_result.get('distance', 1.0)
|
| 280 |
+
relevance = 1.0 - (distance**2) / 2.0 # Correct mathematical conversion
|
| 281 |
+
relevance_scores.append(relevance)
|
|
|
|
|
|
|
| 282 |
|
| 283 |
+
average_relevance = sum(relevance_scores) / len(relevance_scores)
|
| 284 |
+
high_relevance_count = sum(1 for score in relevance_scores if score >= 0.85)
|
| 285 |
|
| 286 |
relevance_metrics = {
|
| 287 |
"average_relevance": average_relevance,
|
| 288 |
+
"max_relevance": max(relevance_scores),
|
| 289 |
+
"min_relevance": min(relevance_scores),
|
| 290 |
+
"relevance_scores": relevance_scores,
|
| 291 |
"high_relevance_count": high_relevance_count,
|
| 292 |
+
"high_relevance_ratio": high_relevance_count / len(relevance_scores),
|
| 293 |
"retrieved_count": len(processed_results),
|
| 294 |
+
"meets_threshold": average_relevance >= 0.85,
|
| 295 |
"retrieval_time": step3_time
|
| 296 |
}
|
| 297 |
else:
|
|
|
|
| 320 |
"latency_metrics": {
|
| 321 |
"total_latency": total_time,
|
| 322 |
"timing_details": timing_details,
|
| 323 |
+
"meets_target": total_time <= 60.0
|
| 324 |
},
|
| 325 |
|
| 326 |
# Metric 2: Condition Extraction - Success rate from user_prompt.py
|
|
|
|
| 409 |
"latency_metrics": {
|
| 410 |
"total_latency": total_time,
|
| 411 |
"timing_details": timing_details,
|
| 412 |
+
"meets_target": total_time <= 60.0
|
| 413 |
},
|
| 414 |
|
| 415 |
# Metric 2: Condition Extraction - Partial data may be available before failure
|
|
|
|
| 544 |
"min_latency": min(latencies),
|
| 545 |
"max_latency": max(latencies),
|
| 546 |
"query_count": len(latencies),
|
| 547 |
+
"target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies),
|
| 548 |
"individual_latencies": latencies
|
| 549 |
}
|
| 550 |
else:
|
|
|
|
| 659 |
"max_latency": max(latencies),
|
| 660 |
"successful_queries": len(all_successful_results),
|
| 661 |
"total_queries": total_queries,
|
| 662 |
+
"target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
|
| 663 |
}
|
| 664 |
|
| 665 |
elif metric_name == "extraction":
|
|
|
|
| 680 |
"min_relevance": min(relevance_scores),
|
| 681 |
"successful_queries": len(all_successful_results),
|
| 682 |
"total_queries": total_queries,
|
| 683 |
+
"meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.85,
|
| 684 |
+
"target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.7
|
| 685 |
}
|
| 686 |
|
| 687 |
elif metric_name == "coverage" and all_successful_results:
|
|
|
|
| 864 |
|
| 865 |
if metric_name == "latency":
|
| 866 |
print(f" Average: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
|
| 867 |
+
print(f" 60s Target: {'✅ Met' if overall_results['target_compliance'] >= 0.8 else '❌ Not Met'}")
|
| 868 |
|
| 869 |
elif metric_name == "extraction":
|
| 870 |
print(f" Success Rate: {overall_results['success_rate']:.1%}")
|
evaluation/metric7_8_precision_MRR.py
CHANGED
|
@@ -6,6 +6,12 @@ OnCall.ai System - Precision & MRR Analyzer (Metrics 7-8)
|
|
| 6 |
Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
|
| 7 |
using data collected from latency_evaluator.py comprehensive evaluation.
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
METRICS CALCULATED:
|
| 10 |
7. Precision@K (檢索精確率) - Proportion of relevant results in top-K retrieval
|
| 11 |
8. Mean Reciprocal Rank (平均倒數排名) - Average reciprocal rank of first relevant result
|
|
@@ -18,6 +24,7 @@ DESIGN PRINCIPLE:
|
|
| 18 |
|
| 19 |
Author: YanBo Chen
|
| 20 |
Date: 2025-08-04
|
|
|
|
| 21 |
"""
|
| 22 |
|
| 23 |
import json
|
|
@@ -121,8 +128,8 @@ class PrecisionMRRAnalyzer:
|
|
| 121 |
# Step 1: Determine query complexity
|
| 122 |
is_complex = self._is_complex_query(query, processed_results)
|
| 123 |
|
| 124 |
-
# Step 2: Choose adaptive threshold
|
| 125 |
-
threshold = 0.
|
| 126 |
|
| 127 |
print(f" 🎯 Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")
|
| 128 |
|
|
|
|
| 6 |
Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
|
| 7 |
using data collected from latency_evaluator.py comprehensive evaluation.
|
| 8 |
|
| 9 |
+
IMPORTANT CHANGES - Angular Distance & Relevance Calculation:
|
| 10 |
+
- DISTANCE METRIC: Uses Angular Distance from Annoy index (range: 0.0-1.0, smaller = more relevant)
|
| 11 |
+
- RELEVANCE CONVERSION: relevance = 1.0 - (angular_distance²) / 2.0 (mathematical correct formula)
|
| 12 |
+
- THRESHOLD ALIGNMENT: Aligned with Metric 3 relevance calculation standards
|
| 13 |
+
- DISPLAY UPDATE: Changed from "Relevance: X" to "Angular Distance: X" for clarity
|
| 14 |
+
|
| 15 |
METRICS CALCULATED:
|
| 16 |
7. Precision@K (檢索精確率) - Proportion of relevant results in top-K retrieval
|
| 17 |
8. Mean Reciprocal Rank (平均倒數排名) - Average reciprocal rank of first relevant result
|
|
|
|
| 24 |
|
| 25 |
Author: YanBo Chen
|
| 26 |
Date: 2025-08-04
|
| 27 |
+
Updated: 2025-08-04 (Angular Distance alignment)
|
| 28 |
"""
|
| 29 |
|
| 30 |
import json
|
|
|
|
| 128 |
# Step 1: Determine query complexity
|
| 129 |
is_complex = self._is_complex_query(query, processed_results)
|
| 130 |
|
| 131 |
+
# Step 2: Choose adaptive threshold (aligned with Metric 3 relevance calculation)
|
| 132 |
+
threshold = 0.75 if is_complex else 0.8
|
| 133 |
|
| 134 |
print(f" 🎯 Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")
|
| 135 |
|
src/generation.py
CHANGED
|
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
|
|
| 30 |
|
| 31 |
# Fallback Generation Configuration (Simplified Architecture)
|
| 32 |
FALLBACK_TIMEOUTS = {
|
| 33 |
-
"primary":
|
| 34 |
"fallback_1": 1.0, # RAG template generation (renamed from fallback_2)
|
| 35 |
"fallback_2": 0.1 # Minimal template generation (instant)
|
| 36 |
}
|
|
@@ -279,7 +279,7 @@ class MedicalAdviceGenerator:
|
|
| 279 |
|
| 280 |
# Format each chunk with metadata
|
| 281 |
context_part = f"""
|
| 282 |
-
[Guideline {i}] (Source: {chunk_type.title()},
|
| 283 |
{chunk_text}
|
| 284 |
""".strip()
|
| 285 |
|
|
|
|
| 30 |
|
| 31 |
# Fallback Generation Configuration (Simplified Architecture)
|
| 32 |
FALLBACK_TIMEOUTS = {
|
| 33 |
+
"primary": 60.0, # Primary Med42-70B increased timeout for stable evaluation
|
| 34 |
"fallback_1": 1.0, # RAG template generation (renamed from fallback_2)
|
| 35 |
"fallback_2": 0.1 # Minimal template generation (instant)
|
| 36 |
}
|
|
|
|
| 279 |
|
| 280 |
# Format each chunk with metadata
|
| 281 |
context_part = f"""
|
| 282 |
+
[Guideline {i}] (Source: {chunk_type.title()}, Angular Distance: {distance:.3f})
|
| 283 |
{chunk_text}
|
| 284 |
""".strip()
|
| 285 |
|