"""Comparison and reporting functionality for benchmarks."""

import logging
from typing import Any, Dict, List, Optional

from scipy import stats

logger = logging.getLogger(__name__)


class BenchmarkComparison:
    """
    Compares benchmark results and generates reports.

    Computes improvement deltas and statistical significance.
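
    Example (illustrative sketch; ``baseline_results`` and ``trained_results``
    are placeholder names for metric dictionaries produced by a benchmark run):
        comparer = BenchmarkComparison()
        comparison = comparer.compare_results(baseline_results, trained_results)
        report = comparer.generate_summary_report(comparison)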
"""
    def __init__(self):
        """Initialize comparison tool."""

    def compare_results(
        self,
        baseline: Dict[str, Any],
        trained: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Compare baseline and trained model results.

        Args:
            baseline: Baseline benchmark results
            trained: Trained model benchmark results

        Returns:
            Comparison dictionary with deltas and significance
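
        Example (illustrative; metric names and values are invented):
            comparison = comparer.compare_results(
                {'model_name': 'base', 'word_error_rate': 0.12, 'quality_score': 3.5},
                {'model_name': 'tuned', 'word_error_rate': 0.09, 'quality_score': 3.8},
            )
            # comparison['improvements']['word_error_rate']['improved'] -> True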
"""
        comparison = {
            'baseline': baseline,
            'trained': trained,
            'deltas': {},
            'improvements': {},
            'statistical_significance': {}
        }

        # Compute deltas for all numeric metrics present in both result sets
        metric_keys = set(baseline.keys()) & set(trained.keys())
        for key in metric_keys:
            baseline_val = baseline[key]
            trained_val = trained[key]
            # Only compare numeric metrics; bool is excluded explicitly
            # because it is a subclass of int.
            if isinstance(baseline_val, bool) or isinstance(trained_val, bool):
                continue
            if not isinstance(baseline_val, (int, float)) or not isinstance(trained_val, (int, float)):
                continue

            # Compute delta
            delta = trained_val - baseline_val
            comparison['deltas'][key] = delta

            # Determine if this is an improvement
            # For error rates, lower is better
            if 'error' in key.lower() or 'distortion' in key.lower():
                is_improvement = delta < 0
                improvement_pct = -100 * delta / baseline_val if baseline_val != 0 else 0
            else:
                # For quality scores, higher is better
                is_improvement = delta > 0
                improvement_pct = 100 * delta / baseline_val if baseline_val != 0 else 0

            comparison['improvements'][key] = {
                'improved': is_improvement,
                'delta': delta,
                'percent_change': improvement_pct
            }

        return comparison

    def compute_statistical_significance(
        self,
        baseline_samples: List[float],
        trained_samples: List[float],
        alpha: float = 0.05
    ) -> Dict[str, Any]:
        """
        Compute statistical significance of improvement.

        Uses a paired t-test to determine whether the difference is significant.

        Args:
            baseline_samples: Baseline metric values
            trained_samples: Trained model metric values
            alpha: Significance level

        Returns:
            Dictionary with test results
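
        Example (illustrative; the per-sample scores are invented and would
        normally come from per-utterance or per-item benchmark measurements):
            result = comparer.compute_statistical_significance(
                baseline_samples=[0.12, 0.15, 0.11, 0.14],
                trained_samples=[0.10, 0.12, 0.09, 0.12],
            )
            # result['significant'] is True only if the paired t-test p-value < alpha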
"""
        if len(baseline_samples) != len(trained_samples):
            raise ValueError("Sample lists must have same length")

        if len(baseline_samples) < 2:
            return {
                'significant': False,
                'p_value': 1.0,
                'test': 'insufficient_data'
            }

        # Perform paired t-test
        t_statistic, p_value = stats.ttest_rel(baseline_samples, trained_samples)
        is_significant = p_value < alpha

        return {
            'significant': bool(is_significant),
            'p_value': float(p_value),
            't_statistic': float(t_statistic),
            'alpha': alpha,
            'test': 'paired_t_test',
            'n_samples': len(baseline_samples)
        }

    def rank_improvements(
        self,
        comparison: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """
        Rank metrics by improvement magnitude.

        Args:
            comparison: Comparison dictionary from compare_results

        Returns:
            List of improvements sorted by magnitude
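
        Example (illustrative):
            ranked = comparer.rank_improvements(comparison)
            # ranked[0] is the metric with the largest absolute percent change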
"""
        improvements = comparison.get('improvements', {})

        ranked = []
        for metric, info in improvements.items():
            ranked.append({
                'metric': metric,
                'improved': info['improved'],
                'delta': info['delta'],
                'percent_change': info['percent_change']
            })

        # Sort by absolute percent change
        ranked.sort(key=lambda x: abs(x['percent_change']), reverse=True)
        return ranked

    def generate_summary_report(
        self,
        comparison: Dict[str, Any],
        significance_results: Optional[Dict[str, Dict]] = None
    ) -> str:
        """
        Generate human-readable summary report.

        Args:
            comparison: Comparison dictionary
            significance_results: Optional statistical significance results per metric

        Returns:
            Formatted report string
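
        Example (illustrative):
            report = comparer.generate_summary_report(comparison)
            print(report)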
"""
        lines = []
        lines.append("=" * 60)
        lines.append("BENCHMARK COMPARISON REPORT")
        lines.append("=" * 60)
        lines.append("")

        # Model info
        baseline = comparison.get('baseline', {})
        trained = comparison.get('trained', {})
        lines.append(f"Baseline Model: {baseline.get('model_name', 'Unknown')}")
        lines.append(f"Trained Model: {trained.get('model_name', 'Unknown')}")
        lines.append(f"Baseline Timestamp: {baseline.get('timestamp', 'Unknown')}")
        lines.append(f"Trained Timestamp: {trained.get('timestamp', 'Unknown')}")
        lines.append("")

        # Improvements
        lines.append("IMPROVEMENTS:")
        lines.append("-" * 60)
        ranked = self.rank_improvements(comparison)
        for item in ranked:
            metric = item['metric']
            delta = item['delta']
            pct = item['percent_change']
            improved = item['improved']
            status = "✓ IMPROVED" if improved else "✗ REGRESSED"

            sig_marker = ""
            if significance_results and metric in significance_results:
                if significance_results[metric].get('significant'):
                    sig_marker = " *"

            lines.append(f"{metric:40s} {status:12s} {delta:+10.4f} ({pct:+6.2f}%){sig_marker}")

        if significance_results:
            lines.append("")
            lines.append("* Statistically significant at α=0.05")

        lines.append("")
        lines.append("=" * 60)
        return "\n".join(lines)