| """Comparison and reporting functionality for benchmarks.""" | |
| import numpy as np | |
| from typing import Dict, Any, List, Optional | |
| from scipy import stats | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| class BenchmarkComparison: | |
| """ | |
| Compares benchmark results and generates reports. | |
| Computes improvement deltas and statistical significance. | |
| """ | |
| def __init__(self): | |
| """Initialize comparison tool.""" | |
| pass | |
| def compare_results( | |
| self, | |
| baseline: Dict[str, Any], | |
| trained: Dict[str, Any] | |
| ) -> Dict[str, Any]: | |
| """ | |
| Compare baseline and trained model results. | |
| Args: | |
| baseline: Baseline benchmark results | |
| trained: Trained model benchmark results | |
| Returns: | |
| Comparison dictionary with deltas and significance | |
| """ | |
| comparison = { | |
| 'baseline': baseline, | |
| 'trained': trained, | |
| 'deltas': {}, | |
| 'improvements': {}, | |
| 'statistical_significance': {} | |
| } | |
| # Compute deltas for all numeric metrics | |
| metric_keys = set(baseline.keys()) & set(trained.keys()) | |
| for key in metric_keys: | |
| if isinstance(baseline.get(key), (int, float)) and isinstance(trained.get(key), (int, float)): | |
| baseline_val = baseline[key] | |
| trained_val = trained[key] | |
| # Compute delta | |
| delta = trained_val - baseline_val | |
| comparison['deltas'][key] = delta | |
| # Determine if this is an improvement | |
| # For error rates, lower is better | |
| if 'error' in key.lower() or 'distortion' in key.lower(): | |
| is_improvement = delta < 0 | |
| improvement_pct = -100 * delta / baseline_val if baseline_val != 0 else 0 | |
| else: | |
| # For quality scores, higher is better | |
| is_improvement = delta > 0 | |
| improvement_pct = 100 * delta / baseline_val if baseline_val != 0 else 0 | |
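                # Note: percent_change falls back to 0 when the baseline value
                # is 0, since a relative change is undefined in that case.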
                comparison['improvements'][key] = {
                    'improved': is_improvement,
                    'delta': delta,
                    'percent_change': improvement_pct
                }

        return comparison

    def compute_statistical_significance(
        self,
        baseline_samples: List[float],
        trained_samples: List[float],
        alpha: float = 0.05
    ) -> Dict[str, Any]:
        """
        Compute statistical significance of improvement.

        Uses a paired t-test to determine if the difference is significant.

        Args:
            baseline_samples: Baseline metric values
            trained_samples: Trained model metric values
            alpha: Significance level

        Returns:
            Dictionary with test results
        """
        if len(baseline_samples) != len(trained_samples):
            raise ValueError("Sample lists must have same length")

        if len(baseline_samples) < 2:
            return {
                'significant': False,
                'p_value': 1.0,
                'test': 'insufficient_data'
            }

        # Perform paired t-test
        t_statistic, p_value = stats.ttest_rel(baseline_samples, trained_samples)
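        # Note: ttest_rel is two-sided by default, so a small p-value only
        # indicates that the paired means differ; it does not by itself
        # establish the direction of the change.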
        is_significant = p_value < alpha

        return {
            'significant': bool(is_significant),
            'p_value': float(p_value),
            't_statistic': float(t_statistic),
            'alpha': alpha,
            'test': 'paired_t_test',
            'n_samples': len(baseline_samples)
        }

    def rank_improvements(
        self,
        comparison: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """
        Rank metrics by improvement magnitude.

        Args:
            comparison: Comparison dictionary from compare_results

        Returns:
            List of improvements sorted by magnitude
        """
        improvements = comparison.get('improvements', {})

        ranked = []
        for metric, info in improvements.items():
            ranked.append({
                'metric': metric,
                'improved': info['improved'],
                'delta': info['delta'],
                'percent_change': info['percent_change']
            })

        # Sort by absolute percent change
        ranked.sort(key=lambda x: abs(x['percent_change']), reverse=True)
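        # Note: improvements and regressions are ranked together by magnitude;
        # the 'improved' flag distinguishes them.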
        return ranked

    def generate_summary_report(
        self,
        comparison: Dict[str, Any],
        significance_results: Optional[Dict[str, Dict]] = None
    ) -> str:
        """
        Generate human-readable summary report.

        Args:
            comparison: Comparison dictionary
            significance_results: Optional statistical significance results per metric

        Returns:
            Formatted report string
        """
        lines = []
        lines.append("=" * 60)
        lines.append("BENCHMARK COMPARISON REPORT")
        lines.append("=" * 60)
        lines.append("")

        # Model info
        baseline = comparison.get('baseline', {})
        trained = comparison.get('trained', {})
        lines.append(f"Baseline Model: {baseline.get('model_name', 'Unknown')}")
        lines.append(f"Trained Model: {trained.get('model_name', 'Unknown')}")
        lines.append(f"Baseline Timestamp: {baseline.get('timestamp', 'Unknown')}")
        lines.append(f"Trained Timestamp: {trained.get('timestamp', 'Unknown')}")
        lines.append("")

        # Improvements
        lines.append("IMPROVEMENTS:")
        lines.append("-" * 60)

        ranked = self.rank_improvements(comparison)
        for item in ranked:
            metric = item['metric']
            delta = item['delta']
            pct = item['percent_change']
            improved = item['improved']

            status = "✓ IMPROVED" if improved else "✗ REGRESSED"

            sig_marker = ""
            if significance_results and metric in significance_results:
                if significance_results[metric].get('significant'):
                    sig_marker = " *"

            lines.append(f"{metric:40s} {status:12s} {delta:+10.4f} ({pct:+6.2f}%){sig_marker}")

        if significance_results:
            lines.append("")
            lines.append("* Statistically significant at α=0.05")

        lines.append("")
        lines.append("=" * 60)

        return "\n".join(lines)