"""Comparison and reporting functionality for benchmarks."""
import numpy as np
from typing import Dict, Any, List, Optional
from scipy import stats
import logging
logger = logging.getLogger(__name__)
class BenchmarkComparison:
    """
    Compares benchmark results and generates reports.

    Computes improvement deltas and statistical significance.
    """

    def __init__(self):
        """Initialize comparison tool."""
        pass
    def compare_results(
        self,
        baseline: Dict[str, Any],
        trained: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Compare baseline and trained model results.

        Args:
            baseline: Baseline benchmark results
            trained: Trained model benchmark results

        Returns:
            Comparison dictionary with deltas and significance
        """
        comparison = {
            'baseline': baseline,
            'trained': trained,
            'deltas': {},
            'improvements': {},
            'statistical_significance': {}
        }

        # Compute deltas for all numeric metrics shared by both result sets.
        # Booleans are excluded explicitly: isinstance(True, int) is True in
        # Python, so boolean flags would otherwise be treated as metrics.
        metric_keys = set(baseline.keys()) & set(trained.keys())
        for key in metric_keys:
            baseline_val = baseline[key]
            trained_val = trained[key]
            if (
                isinstance(baseline_val, (int, float)) and not isinstance(baseline_val, bool)
                and isinstance(trained_val, (int, float)) and not isinstance(trained_val, bool)
            ):
                # Compute delta
                delta = trained_val - baseline_val
                comparison['deltas'][key] = delta

                # Determine if this is an improvement.
                # For error rates and distortion, lower is better.
                if 'error' in key.lower() or 'distortion' in key.lower():
                    is_improvement = delta < 0
                    improvement_pct = -100 * delta / baseline_val if baseline_val != 0 else 0
                else:
                    # For quality scores, higher is better
                    is_improvement = delta > 0
                    improvement_pct = 100 * delta / baseline_val if baseline_val != 0 else 0

                comparison['improvements'][key] = {
                    'improved': is_improvement,
                    'delta': delta,
                    'percent_change': improvement_pct
                }

        return comparison
    def compute_statistical_significance(
        self,
        baseline_samples: List[float],
        trained_samples: List[float],
        alpha: float = 0.05
    ) -> Dict[str, Any]:
        """
        Compute statistical significance of improvement.

        Uses a paired t-test to determine if the difference is significant.

        Args:
            baseline_samples: Baseline metric values
            trained_samples: Trained model metric values
            alpha: Significance level

        Returns:
            Dictionary with test results
        """
        if len(baseline_samples) != len(trained_samples):
            raise ValueError("Sample lists must have same length")

        if len(baseline_samples) < 2:
            return {
                'significant': False,
                'p_value': 1.0,
                'test': 'insufficient_data'
            }

        # Perform paired t-test
        t_statistic, p_value = stats.ttest_rel(baseline_samples, trained_samples)
        is_significant = p_value < alpha

        return {
            'significant': bool(is_significant),
            'p_value': float(p_value),
            't_statistic': float(t_statistic),
            'alpha': alpha,
            'test': 'paired_t_test',
            'n_samples': len(baseline_samples)
        }
    def rank_improvements(
        self,
        comparison: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """
        Rank metrics by improvement magnitude.

        Args:
            comparison: Comparison dictionary from compare_results

        Returns:
            List of improvements sorted by magnitude
        """
        improvements = comparison.get('improvements', {})

        ranked = []
        for metric, info in improvements.items():
            ranked.append({
                'metric': metric,
                'improved': info['improved'],
                'delta': info['delta'],
                'percent_change': info['percent_change']
            })

        # Sort by absolute percent change
        ranked.sort(key=lambda x: abs(x['percent_change']), reverse=True)

        return ranked
    def generate_summary_report(
        self,
        comparison: Dict[str, Any],
        significance_results: Optional[Dict[str, Dict]] = None
    ) -> str:
        """
        Generate a human-readable summary report.

        Args:
            comparison: Comparison dictionary
            significance_results: Optional statistical significance results per metric

        Returns:
            Formatted report string
        """
        lines = []
        lines.append("=" * 60)
        lines.append("BENCHMARK COMPARISON REPORT")
        lines.append("=" * 60)
        lines.append("")

        # Model info
        baseline = comparison.get('baseline', {})
        trained = comparison.get('trained', {})
        lines.append(f"Baseline Model: {baseline.get('model_name', 'Unknown')}")
        lines.append(f"Trained Model: {trained.get('model_name', 'Unknown')}")
        lines.append(f"Baseline Timestamp: {baseline.get('timestamp', 'Unknown')}")
        lines.append(f"Trained Timestamp: {trained.get('timestamp', 'Unknown')}")
        lines.append("")

        # Improvements
        lines.append("IMPROVEMENTS:")
        lines.append("-" * 60)
        ranked = self.rank_improvements(comparison)
        for item in ranked:
            metric = item['metric']
            delta = item['delta']
            pct = item['percent_change']
            improved = item['improved']

            status = "✓ IMPROVED" if improved else "✗ REGRESSED"

            sig_marker = ""
            if significance_results and metric in significance_results:
                if significance_results[metric].get('significant'):
                    sig_marker = " *"

            lines.append(f"{metric:40s} {status:12s} {delta:+10.4f} ({pct:+6.2f}%){sig_marker}")

        if significance_results:
            lines.append("")
            lines.append("* Statistically significant at α=0.05")

        lines.append("")
        lines.append("=" * 60)

        return "\n".join(lines)