"""Comparison and reporting functionality for benchmarks."""
import logging
from typing import Any, Dict, List, Optional

from scipy import stats

logger = logging.getLogger(__name__)


class BenchmarkComparison:
    """
    Compares benchmark results and generates reports.
    
    Computes improvement deltas and statistical significance.
    """
    
    def __init__(self):
        """Initialize comparison tool."""
        pass
    
    def compare_results(
        self,
        baseline: Dict[str, Any],
        trained: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Compare baseline and trained model results.
        
        Args:
            baseline: Baseline benchmark results
            trained: Trained model benchmark results
        
        Returns:
            Comparison dictionary with deltas and significance
        """
        comparison = {
            'baseline': baseline,
            'trained': trained,
            'deltas': {},
            'improvements': {},
            'statistical_significance': {}
        }
        
        # Compute deltas for all numeric metrics
        metric_keys = set(baseline.keys()) & set(trained.keys())
        
        for key in metric_keys:
            baseline_val = baseline[key]
            trained_val = trained[key]
            # Only compare numeric metrics; bool is excluded since it subclasses int.
            if (isinstance(baseline_val, (int, float)) and not isinstance(baseline_val, bool)
                    and isinstance(trained_val, (int, float)) and not isinstance(trained_val, bool)):
                
                # Compute delta
                delta = trained_val - baseline_val
                comparison['deltas'][key] = delta
                
                # Determine if this is an improvement
                # For error rates, lower is better
                if 'error' in key.lower() or 'distortion' in key.lower():
                    is_improvement = delta < 0
                    improvement_pct = -100 * delta / baseline_val if baseline_val != 0 else 0
                else:
                    # For quality scores, higher is better
                    is_improvement = delta > 0
                    improvement_pct = 100 * delta / baseline_val if baseline_val != 0 else 0
                
                comparison['improvements'][key] = {
                    'improved': is_improvement,
                    'delta': delta,
                    'percent_change': improvement_pct
                }
        
        return comparison
    
    def compute_statistical_significance(
        self,
        baseline_samples: List[float],
        trained_samples: List[float],
        alpha: float = 0.05
    ) -> Dict[str, Any]:
        """
        Compute statistical significance of improvement.
        
        Uses paired t-test to determine if difference is significant.
        
        Args:
            baseline_samples: Baseline metric values
            trained_samples: Trained model metric values
            alpha: Significance level
        
        Returns:
            Dictionary with test results
        """
        if len(baseline_samples) != len(trained_samples):
            raise ValueError("Sample lists must have same length")
        
        if len(baseline_samples) < 2:
            return {
                'significant': False,
                'p_value': 1.0,
                'test': 'insufficient_data'
            }
        
        # Perform paired t-test
        t_statistic, p_value = stats.ttest_rel(baseline_samples, trained_samples)
        
        is_significant = p_value < alpha
        
        return {
            'significant': bool(is_significant),
            'p_value': float(p_value),
            't_statistic': float(t_statistic),
            'alpha': alpha,
            'test': 'paired_t_test',
            'n_samples': len(baseline_samples)
        }
    
    def rank_improvements(
        self,
        comparison: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """
        Rank metrics by improvement magnitude.
        
        Args:
            comparison: Comparison dictionary from compare_results
        
        Returns:
            List of improvements sorted by magnitude
        """
        improvements = comparison.get('improvements', {})
        
        ranked = []
        for metric, info in improvements.items():
            ranked.append({
                'metric': metric,
                'improved': info['improved'],
                'delta': info['delta'],
                'percent_change': info['percent_change']
            })
        
        # Sort by absolute percent change
        ranked.sort(key=lambda x: abs(x['percent_change']), reverse=True)
        
        return ranked
    
    def generate_summary_report(
        self,
        comparison: Dict[str, Any],
        significance_results: Optional[Dict[str, Dict]] = None
    ) -> str:
        """
        Generate human-readable summary report.
        
        Args:
            comparison: Comparison dictionary
            significance_results: Optional statistical significance results per metric
        
        Returns:
            Formatted report string
        """
        lines = []
        lines.append("=" * 60)
        lines.append("BENCHMARK COMPARISON REPORT")
        lines.append("=" * 60)
        lines.append("")
        
        # Model info
        baseline = comparison.get('baseline', {})
        trained = comparison.get('trained', {})
        
        lines.append(f"Baseline Model: {baseline.get('model_name', 'Unknown')}")
        lines.append(f"Trained Model: {trained.get('model_name', 'Unknown')}")
        lines.append(f"Baseline Timestamp: {baseline.get('timestamp', 'Unknown')}")
        lines.append(f"Trained Timestamp: {trained.get('timestamp', 'Unknown')}")
        lines.append("")
        
        # Improvements
        lines.append("IMPROVEMENTS:")
        lines.append("-" * 60)
        
        ranked = self.rank_improvements(comparison)
        
        for item in ranked:
            metric = item['metric']
            delta = item['delta']
            pct = item['percent_change']
            improved = item['improved']
            
            status = "✓ IMPROVED" if improved else "✗ REGRESSED"
            
            sig_marker = ""
            if significance_results and metric in significance_results:
                if significance_results[metric].get('significant'):
                    sig_marker = " *"
            
            lines.append(f"{metric:40s} {status:12s} {delta:+10.4f} ({pct:+6.2f}%){sig_marker}")
        
        if significance_results:
            lines.append("")
            # Report the significance level actually used (stored per metric; default 0.05).
            alphas = sorted({res.get('alpha', 0.05) for res in significance_results.values()})
            alpha_text = ", ".join(f"{a:g}" for a in alphas)
            lines.append(f"* Statistically significant at α={alpha_text}")
        
        lines.append("")
        lines.append("=" * 60)
        
        return "\n".join(lines)