ardaatahan committed
Commit 6921fc0 · 1 Parent(s): 6b28c2b

Update logic and add tests

.github/scripts/test_wer_regression_check.py ADDED
@@ -0,0 +1,483 @@
+ #!/usr/bin/env python3
+ """
+ Test script for WER regression detection
+ Tests all regression detection functions with synthetic and real data
+ """
+
+ import json
+ import sys
+ from wer_regression_check import (
+     detect_device_regressions,
+     detect_os_regressions,
+     detect_release_regressions,
+     detect_speed_device_regressions,
+     detect_speed_os_regressions,
+     detect_speed_release_regressions,
+     detect_tokens_device_regressions,
+     detect_tokens_os_regressions,
+     detect_tokens_release_regressions,
+     generate_slack_message,
+     load_performance_data
+ )
+
+
+ def test_wer_detection_with_synthetic_data():
+     """Test WER detection with known synthetic data"""
+     print("\n" + "="*80)
+     print("TEST 1: WER Detection with Synthetic Data")
+     print("="*80)
+
+     # Create synthetic data where we know there should be regressions
+     # Historical data (best performances)
+     historical_data = [
+         # Model A: iPhone has best WER of 10%
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0},
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.2, "speed": 9.8, "tokens_per_second": 49.0},
+
+         # Model B: iOS 17 has best WER of 10%
+         {"model": "model-b", "device": "iPhone 15", "os": "iOS 17", "average_wer": 10.0, "speed": 20.0, "tokens_per_second": 100.0},
+         {"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 19.0, "tokens_per_second": 95.0},
+
+         # Model C: No regression scenario
+         {"model": "model-c", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0},
+         {"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0},
+     ]
+
+     # Current data (latest release with regressions)
+     current_data = [
+         # Model A: iPad Pro has regressed to 15% WER (~47% worse than its device best of 10.2%)
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 15.0, "speed": 8.0, "tokens_per_second": 40.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.3, "speed": 9.7, "tokens_per_second": 49.5},
+
+         # Model B: iOS 18 has regressed to 13% WER (~24% worse than its OS best of 10.5%)
+         {"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 13.0, "speed": 15.0, "tokens_per_second": 75.0},
+
+         # Model C: Still within 20% (11.0% vs device best of 10.5%)
+         {"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 9.0, "tokens_per_second": 45.0},
+     ]
+
+     # Test device regressions
+     device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0)
+     print(f"\n✓ Device WER Regressions Found: {len(device_regressions)}")
+
+     # Debug: print all found regressions
+     for r in device_regressions:
+         print(f" - {r['model']}: {r['device']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)")
+
+     # Model A should trigger (iPad Pro is ~47% worse than its historical best)
+     # Model C should NOT trigger (iPad Pro is only ~5% worse than its historical best)
+     assert len(device_regressions) >= 1, f"Expected at least 1 device regression, got {len(device_regressions)}"
+
+     # Verify model-a is in the regressions
+     model_a_regressions = [r for r in device_regressions if r["model"] == "model-a"]
+     assert len(model_a_regressions) > 0, "Expected model-a to have device regression"
+     print("\n✓ Model-a correctly flagged for device regression")
+
+     # Test OS regressions
+     os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0)
+     print(f"\n✓ OS WER Regressions Found: {len(os_regressions)}")
+
+     # Debug: print all found OS regressions
+     for r in os_regressions:
+         print(f" - {r['model']}: {r['os']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)")
+
+     assert len(os_regressions) >= 1, f"Expected at least 1 OS regression, got {len(os_regressions)}"
+
+     # Verify model-b is in the regressions
+     model_b_regressions = [r for r in os_regressions if r["model"] == "model-b"]
+     assert len(model_b_regressions) > 0, "Expected model-b to have OS regression"
+     print("\n✓ Model-b correctly flagged for OS regression")
+
+     print("\n✅ TEST 1 PASSED: WER detection works correctly with synthetic data")
+     return True
+
+
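
Note: the deviation rule these synthetic cases exercise reduces to one comparison per metric direction. A minimal standalone sketch of that rule (the helper below is illustrative only, not part of the commit):

def deviates(current: float, best: float, higher_is_better: bool, threshold: float = 20.0) -> bool:
    # Mirrors the detectors' division-by-zero guard
    if best <= 0:
        return False
    # WER: lower is better; speed and tokens/sec: higher is better
    pct = (best - current) / best * 100 if higher_is_better else (current - best) / best * 100
    return pct > threshold

assert deviates(15.0, 10.2, higher_is_better=False)      # model-a iPad Pro WER, ~47% worse
assert not deviates(11.0, 10.5, higher_is_better=False)  # model-c iPad Pro WER, ~4.8% worse
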
+ def test_speed_detection_with_synthetic_data():
+     """Test speed detection with known synthetic data"""
+     print("\n" + "="*80)
+     print("TEST 2: Speed Detection with Synthetic Data")
+     print("="*80)
+
+     # Historical data (best performances)
+     historical_data = [
+         # Model A: iPhone has best speed of 100
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 200.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 190.0},
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 98.0, "tokens_per_second": 195.0},
+     ]
+
+     # Current data (with speed regression)
+     current_data = [
+         # Model A: iPad Pro has regressed to speed 60 (~39% slower than its device best of 98)
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 60.0, "tokens_per_second": 120.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.2, "speed": 97.0, "tokens_per_second": 195.0},
+     ]
+
+     # Test device speed regressions
+     speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0)
+     print(f"\n✓ Device Speed Regressions Found: {len(speed_device_regressions)}")
+     assert len(speed_device_regressions) == 1, f"Expected 1 speed device regression, got {len(speed_device_regressions)}"
+     print(f" - {speed_device_regressions[0]['model']}: {speed_device_regressions[0]['device']} has {speed_device_regressions[0]['current_value']}x speed vs best {speed_device_regressions[0]['best_value']}x")
+
+     print("\n✅ TEST 2 PASSED: Speed detection works correctly with synthetic data")
+     return True
+
+
+ def test_tokens_detection_with_synthetic_data():
+     """Test tokens per second detection with known synthetic data"""
+     print("\n" + "="*80)
+     print("TEST 3: Tokens/Second Detection with Synthetic Data")
+     print("="*80)
+
+     # Historical data (best performances)
+     historical_data = [
+         # Model A: iPhone has best tokens/sec of 500
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 98.0, "tokens_per_second": 490.0},
+     ]
+
+     # Current data (with tokens/sec regression)
+     current_data = [
+         # Model A: iPad Pro has regressed to 300 tokens/sec (~39% slower than its device best of 490)
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 80.0, "tokens_per_second": 300.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.1, "speed": 99.0, "tokens_per_second": 495.0},
+     ]
+
+     # Test device tokens regressions
+     tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0)
+     print(f"\n✓ Device Tokens/Sec Regressions Found: {len(tokens_device_regressions)}")
+     assert len(tokens_device_regressions) == 1, f"Expected 1 tokens device regression, got {len(tokens_device_regressions)}"
+     print(f" - {tokens_device_regressions[0]['model']}: {tokens_device_regressions[0]['device']} has {tokens_device_regressions[0]['current_value']} tokens/sec vs best {tokens_device_regressions[0]['best_value']}")
+
+     print("\n✅ TEST 3 PASSED: Tokens/sec detection works correctly with synthetic data")
+     return True
+
+
+ def test_release_regression_detection():
+     """Test release-to-release regression detection"""
+     print("\n" + "="*80)
+     print("TEST 4: Release-to-Release Regression Detection")
+     print("="*80)
+
+     # Previous release data (best performance)
+     previous_data = [
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 490.0},
+     ]
+
+     # Current release data (degraded: WER 50% worse, speed and tokens/sec 40% lower)
+     current_data = [
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 15.0, "speed": 60.0, "tokens_per_second": 300.0},
+     ]
+
+     # Test WER release regression
+     wer_release_regressions = detect_release_regressions(current_data, previous_data, threshold=20.0)
+     print(f"\n✓ WER Release Regressions Found: {len(wer_release_regressions)}")
+     assert len(wer_release_regressions) == 1, f"Expected 1 WER release regression, got {len(wer_release_regressions)}"
+     print(f" - {wer_release_regressions[0]['model']}: WER increased from {wer_release_regressions[0]['best_historical_value']}% to {wer_release_regressions[0]['current_value']}%")
+
+     # Test speed release regression
+     speed_release_regressions = detect_speed_release_regressions(current_data, previous_data, threshold=20.0)
+     print(f"\n✓ Speed Release Regressions Found: {len(speed_release_regressions)}")
+     assert len(speed_release_regressions) == 1, f"Expected 1 speed release regression, got {len(speed_release_regressions)}"
+     print(f" - {speed_release_regressions[0]['model']}: Speed decreased from {speed_release_regressions[0]['best_historical_value']}x to {speed_release_regressions[0]['current_value']}x")
+
+     # Test tokens release regression
+     tokens_release_regressions = detect_tokens_release_regressions(current_data, previous_data, threshold=20.0)
+     print(f"\n✓ Tokens/Sec Release Regressions Found: {len(tokens_release_regressions)}")
+     assert len(tokens_release_regressions) == 1, f"Expected 1 tokens release regression, got {len(tokens_release_regressions)}"
+     print(f" - {tokens_release_regressions[0]['model']}: Tokens/sec decreased from {tokens_release_regressions[0]['best_historical_value']} to {tokens_release_regressions[0]['current_value']}")
+
+     print("\n✅ TEST 4 PASSED: Release-to-release regression detection works correctly")
+     return True
+
+
+ def test_slack_message_generation():
+     """Test Slack message generation"""
+     print("\n" + "="*80)
+     print("TEST 5: Slack Message Generation")
+     print("="*80)
+
+     # Create sample regressions
+     sample_regressions = [
+         {
+             "type": "device_wer_discrepancy",
+             "metric": "WER",
+             "model": "test-model",
+             "device": "iPad Pro",
+             "current_value": 35.0,
+             "best_value": 25.0,
+             "best_device": "iPhone 15",
+             "best_os": "iOS 18",
+             "percentage_diff": 40.0
+         },
+         {
+             "type": "device_speed_discrepancy",
+             "metric": "Speed",
+             "model": "test-model",
+             "device": "iPad Pro",
+             "current_value": 60.0,
+             "best_value": 100.0,
+             "best_device": "iPhone 15",
+             "best_os": "iOS 18",
+             "percentage_diff": 40.0
+         }
+     ]
+
+     # Generate Slack message
+     slack_payload = generate_slack_message(sample_regressions)
+
+     assert slack_payload is not None, "Expected Slack payload to be generated"
+     assert "blocks" in slack_payload, "Expected 'blocks' in Slack payload"
+     assert len(slack_payload["blocks"]) > 0, "Expected at least one block in Slack payload"
+
+     print("\n✓ Slack Message Generated Successfully")
+     print(f" - Total blocks: {len(slack_payload['blocks'])}")
+     print("\n📧 Full Slack Message Payload:")
+     print("=" * 80)
+     print(json.dumps(slack_payload, indent=2))
+     print("=" * 80)
+
+     print("\n✅ TEST 5 PASSED: Slack message generation works correctly")
+     return True
+
+
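
Note: the assertions above only check the payload's outer shape. For context, a minimal payload of the kind generate_slack_message emits, following Slack's Block Kit structure (values illustrative):

minimal_payload = {
    "blocks": [
        {"type": "header",
         "text": {"type": "plain_text", "text": "⚠️ WhisperKit Performance Regression Alert", "emoji": True}},
        {"type": "section",
         "text": {"type": "mrkdwn",
                  "text": "*test-model*\n• iPad Pro: 35.0% WER\n• Best: 25.0% WER (iPhone 15 on iOS 18)\n• Deviation: +40.0%"}},
    ]
}
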
+ def test_edge_cases():
+     """Test edge cases"""
+     print("\n" + "="*80)
+     print("TEST 6: Edge Cases")
+     print("="*80)
+
+     # Test with single data point (should not trigger any regressions - no historical comparison)
+     single_current = [
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
+     ]
+     empty_historical = []
+
+     device_regressions = detect_device_regressions(single_current, empty_historical, threshold=20.0)
+     assert len(device_regressions) == 0, f"Expected 0 regressions with no historical data, got {len(device_regressions)}"
+     print("✓ Single data point with no historical data handled correctly (no regressions)")
+
+     # Test with empty current data
+     empty_regressions = detect_device_regressions([], single_current, threshold=20.0)
+     assert len(empty_regressions) == 0, "Expected 0 regressions with empty current data"
+     print("✓ Empty current data handled correctly")
+
+     # Test with missing fields (tokens_per_second missing)
+     partial_historical = [
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0},
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0},
+     ]
+     partial_current = [
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 30.0, "speed": 80.0},
+     ]
+
+     # Should still work for WER and speed
+     device_regressions = detect_device_regressions(partial_current, partial_historical, threshold=20.0)
+     print(f"✓ Partial data (missing tokens) handled correctly: {len(device_regressions)} WER regressions found")
+
+     # Should not crash for tokens
+     tokens_regressions = detect_tokens_device_regressions(partial_current, partial_historical, threshold=20.0)
+     assert len(tokens_regressions) == 0, "Expected 0 tokens regressions when field is missing"
+     print("✓ Missing tokens_per_second field handled gracefully")
+
+     print("\n✅ TEST 6 PASSED: Edge cases handled correctly")
+     return True
+
+
+ def test_with_real_data_sample():
+     """Test with a small sample of real data to verify calculations"""
+     print("\n" + "="*80)
+     print("TEST 7: Real Data Sample Verification")
+     print("="*80)
+
+     try:
+         # Load a sample of real data
+         real_data = load_performance_data("dashboard_data/performance_data.json")
+
+         if len(real_data) == 0:
+             print("⚠️ No real data found, skipping this test")
+             return True
+
+         print(f"✓ Loaded {len(real_data)} real data points")
+
+         # Get unique models
+         models = set(entry["model"] for entry in real_data)
+         print(f"✓ Found {len(models)} unique models")
+
+         # Split into historical (first 90%) and current (last 10%) for testing
+         split_point = int(len(real_data) * 0.9)
+         historical_data = real_data[:split_point] if split_point > 0 else real_data
+         current_data = real_data[split_point:] if split_point > 0 else real_data[:10]
+
+         # Run detection on real data
+         device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0)
+         os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0)
+         speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0)
+         tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0)
+
+         print("\n✓ Real Data Analysis:")
+         print(f" - WER device regressions: {len(device_regressions)}")
+         print(f" - WER OS regressions: {len(os_regressions)}")
+         print(f" - Speed device regressions: {len(speed_device_regressions)}")
+         print(f" - Tokens device regressions: {len(tokens_device_regressions)}")
+
+         # Show a few examples if any found
+         if device_regressions:
+             print("\n Example WER regression:")
+             r = device_regressions[0]
+             print(f" Model: {r['model']}")
+             print(f" Device: {r['device']} on {r['os']}")
+             print(f" Current: {r['current_value']}% WER")
+             print(f" Historical best: {r['best_value']}% WER")
+             print(f" Deviation: +{r['percentage_diff']}%")
+
+         if speed_device_regressions:
+             print("\n Example Speed regression:")
+             r = speed_device_regressions[0]
+             print(f" Model: {r['model']}")
+             print(f" Device: {r['device']} on {r['os']}")
+             print(f" Current: {r['current_value']}x speed")
+             print(f" Historical best: {r['best_value']}x speed")
+             print(f" Slower by: {r['percentage_diff']}%")
+
+         print("\n✅ TEST 7 PASSED: Real data processed successfully")
+         return True
+
+     except FileNotFoundError:
+         print("⚠️ dashboard_data/performance_data.json not found, skipping real data test")
+         return True
+     except Exception as e:
+         print(f"❌ Error processing real data: {e}")
+         return False
+
+
+ def manual_verification_helper():
+     """Print data for manual verification"""
+     print("\n" + "="*80)
+     print("MANUAL VERIFICATION HELPER")
+     print("="*80)
+
+     try:
+         real_data = load_performance_data("dashboard_data/performance_data.json")
+
+         # Pick a model to analyze in detail
+         models = {}
+         for entry in real_data:
+             model = entry["model"]
+             if model not in models:
+                 models[model] = []
+             models[model].append(entry)
+
+         # Find a model with multiple entries
+         for model_name, entries in list(models.items())[:3]:  # Check first 3 models
+             if len(entries) >= 3:
+                 print(f"\n📊 Model: {model_name}")
+                 print(f" Total data points: {len(entries)}")
+
+                 # Show WER stats
+                 wer_values = [e["average_wer"] for e in entries]
+                 print("\n WER Analysis:")
+                 print(f" - Best (min): {min(wer_values):.2f}%")
+                 print(f" - Worst (max): {max(wer_values):.2f}%")
+                 print(f" - Difference: {((max(wer_values) - min(wer_values)) / min(wer_values) * 100):.1f}%")
+
+                 # Show by device
+                 devices = {}
+                 for entry in entries:
+                     device = entry["device"]
+                     if device not in devices:
+                         devices[device] = []
+                     devices[device].append(entry["average_wer"])
+
+                 print("\n WER by Device:")
+                 for device, wers in devices.items():
+                     avg_wer = sum(wers) / len(wers)
+                     num_samples = len(wers)
+                     print(f" - {device}: {avg_wer:.2f}% avg ({num_samples} test runs)")
+
+                 # Show speed stats if available
+                 if "speed" in entries[0]:
+                     speed_values = [e["speed"] for e in entries]
+                     print("\n Speed Analysis:")
+                     print(f" - Best (max): {max(speed_values):.2f}x")
+                     print(f" - Worst (min): {min(speed_values):.2f}x")
+                     print(f" - Difference: {((max(speed_values) - min(speed_values)) / max(speed_values) * 100):.1f}%")
+
+                 break
+
+         print("\n" + "="*80)
+         print("Use the above data to manually verify regression detection logic")
+         print("="*80)
+
+     except Exception as e:
+         print(f"Could not load data for manual verification: {e}")
+
+
+ def run_all_tests():
+     """Run all tests"""
+     print("\n" + "="*80)
+     print("🧪 RUNNING ALL REGRESSION DETECTION TESTS")
+     print("="*80)
+
+     tests = [
+         ("WER Detection (Synthetic)", test_wer_detection_with_synthetic_data),
+         ("Speed Detection (Synthetic)", test_speed_detection_with_synthetic_data),
+         ("Tokens Detection (Synthetic)", test_tokens_detection_with_synthetic_data),
+         ("Release Regression Detection", test_release_regression_detection),
+         ("Slack Message Generation", test_slack_message_generation),
+         ("Edge Cases", test_edge_cases),
+         ("Real Data Sample", test_with_real_data_sample),
+     ]
+
+     passed = 0
+     failed = 0
+
+     for test_name, test_func in tests:
+         try:
+             if test_func():
+                 passed += 1
+             else:
+                 failed += 1
+                 print(f"\n❌ {test_name} FAILED")
+         except AssertionError as e:
+             failed += 1
+             print(f"\n❌ {test_name} FAILED: {e}")
+         except Exception as e:
+             failed += 1
+             print(f"\n❌ {test_name} ERROR: {e}")
+             import traceback
+             traceback.print_exc()
+
+     # Print summary
+     print("\n" + "="*80)
+     print("TEST SUMMARY")
+     print("="*80)
+     print(f"✅ Passed: {passed}/{len(tests)}")
+     print(f"❌ Failed: {failed}/{len(tests)}")
+
+     if failed == 0:
+         print("\n🎉 ALL TESTS PASSED! The implementation is working correctly.")
+         print("\nNext steps:")
+         print("1. Run manual verification helper to spot-check real data")
+         print("2. Test in a non-production environment first")
+         print("3. Monitor the first few runs carefully")
+     else:
+         print(f"\n⚠️ {failed} test(s) failed. Please review and fix issues.")
+         return False
+
+     return True
+
+
+ if __name__ == "__main__":
+     success = run_all_tests()
+
+     # Optionally run manual verification helper
+     print("\n" + "="*80)
+     response = input("Run manual verification helper? (y/n): ")
+     if response.lower() == 'y':
+         manual_verification_helper()
+
+     sys.exit(0 if success else 1)

.github/scripts/wer_regression_check.py CHANGED
@@ -1,13 +1,14 @@
  #!/usr/bin/env python3
  """
- WhisperKit WER Regression Detection Script
-
- This script detects significant WER (Word Error Rate) regressions across:
- - Different devices
- - OS versions
- - Previous WhisperKit releases
-
- If any model shows WER discrepancy > 20%, it alerts via Slack.
  """
  
  import json
@@ -51,166 +52,516 @@ def calculate_wer_statistics(wer_values: List[float]) -> Dict[str, float]:
      }
  
  
- def detect_device_regressions(data: List[Dict], threshold: float = 20.0) -> List[Dict]:
      """
-     Detect WER regressions across different devices for each model/OS combination.
      Returns list of regression alerts.
      """
      regressions = []
  
-     # Group by model and OS, then compare across devices
-     model_os_data = defaultdict(list)
-     for entry in data:
-         model_os_key = (entry["model"], entry["os"])
-         model_os_data[model_os_key].append(entry)
  
-     for (model, os), entries in model_os_data.items():
-         if len(entries) < 2:  # Need at least 2 data points to compare
-             continue
  
-         # Group by device
-         device_wer = defaultdict(list)
-         for entry in entries:
-             device_wer[entry["device"]].append(entry["average_wer"])
-
-         # Calculate statistics for each device
-         device_stats = {}
-         for device, wer_values in device_wer.items():
-             device_stats[device] = calculate_wer_statistics(wer_values)
-
-         # Find significant discrepancies between devices
-         devices = list(device_stats.keys())
-         for i in range(len(devices)):
-             for j in range(i + 1, len(devices)):
-                 device_1, device_2 = devices[i], devices[j]
-                 mean_1 = device_stats[device_1]["mean"]
-                 mean_2 = device_stats[device_2]["mean"]
  
-                 # Calculate percentage difference
-                 if mean_1 > 0:  # Avoid division by zero
-                     pct_diff = abs(mean_2 - mean_1) / mean_1 * 100
-
-                     if pct_diff > threshold:
-                         regressions.append({
-                             "type": "device_discrepancy",
-                             "model": model,
-                             "os": os,
-                             "device_1": device_1,
-                             "device_2": device_2,
-                             "wer_1": round(mean_1, 2),
-                             "wer_2": round(mean_2, 2),
-                             "percentage_diff": round(pct_diff, 1)
-                         })
  
      return regressions
  
  
- def detect_os_regressions(data: List[Dict], threshold: float = 20.0) -> List[Dict]:
      """
-     Detect WER regressions across different OS versions for each model/device combination.
      Returns list of regression alerts.
      """
      regressions = []
  
-     # Group by model and device, then compare across OS versions
-     model_device_data = defaultdict(list)
-     for entry in data:
-         model_device_key = (entry["model"], entry["device"])
-         model_device_data[model_device_key].append(entry)
  
-     for (model, device), entries in model_device_data.items():
-         if len(entries) < 2:  # Need at least 2 data points to compare
              continue
  
-         # Group by OS
-         os_wer = defaultdict(list)
-         for entry in entries:
-             os_wer[entry["os"]].append(entry["average_wer"])
-
-         # Calculate statistics for each OS
-         os_stats = {}
-         for os, wer_values in os_wer.items():
-             os_stats[os] = calculate_wer_statistics(wer_values)
-
-         # Find significant discrepancies between OS versions
-         os_versions = list(os_stats.keys())
-         for i in range(len(os_versions)):
-             for j in range(i + 1, len(os_versions)):
-                 os_1, os_2 = os_versions[i], os_versions[j]
-                 mean_1 = os_stats[os_1]["mean"]
-                 mean_2 = os_stats[os_2]["mean"]
-
-                 # Calculate percentage difference
-                 if mean_1 > 0:  # Avoid division by zero
-                     pct_diff = abs(mean_2 - mean_1) / mean_1 * 100
-
-                     if pct_diff > threshold:
-                         regressions.append({
-                             "type": "os_discrepancy",
-                             "model": model,
-                             "device": device,
-                             "os_1": os_1,
-                             "os_2": os_2,
-                             "wer_1": round(mean_1, 2),
-                             "wer_2": round(mean_2, 2),
-                             "percentage_diff": round(pct_diff, 1)
-                         })
  
      return regressions
  
  
- def detect_release_regressions(current_data: List[Dict], previous_data: List[Dict],
-                                threshold: float = 20.0) -> List[Dict]:
      """
-     Detect WER regressions between WhisperKit releases.
      Returns list of regression alerts.
      """
      regressions = []
  
      if not previous_data:
-         print("No previous release data available for comparison")
          return regressions
  
-     # Create lookup dictionaries by (model, device, os)
-     current_lookup = {}
-     previous_lookup = {}
  
      for entry in current_data:
-         key = (entry["model"], entry["device"], entry["os"])
-         current_lookup[key] = entry["average_wer"]
  
      for entry in previous_data:
-         key = (entry["model"], entry["device"], entry["os"])
-         previous_lookup[key] = entry["average_wer"]
  
-     # Compare common configurations
-     common_configs = set(current_lookup.keys()) & set(previous_lookup.keys())
  
-     for config in common_configs:
-         model, device, os = config
-         current_wer = current_lookup[config]
-         previous_wer = previous_lookup[config]
  
-         if previous_wer > 0:  # Avoid division by zero
-             pct_change = (current_wer - previous_wer) / previous_wer * 100
  
-             # Only flag significant WER increases (regressions)
-             if pct_change > threshold:
                  regressions.append({
-                     "type": "release_regression",
                      "model": model,
-                     "device": device,
-                     "os": os,
-                     "previous_wer": round(previous_wer, 2),
-                     "current_wer": round(current_wer, 2),
-                     "percentage_increase": round(pct_change, 1)
                  })
  
      return regressions
  
  
  def generate_slack_message(regressions: List[Dict]) -> Dict:
-     """Generate Slack message payload for WER regression alerts."""
  
      if not regressions:
          return None
@@ -220,7 +571,7 @@ def generate_slack_message(regressions: List[Dict]) -> Dict:
              "type": "header",
              "text": {
                  "type": "plain_text",
-                 "text": "WhisperKit WER Regression Alert",
                  "emoji": True
              }
          },
@@ -229,7 +580,7 @@ def generate_slack_message(regressions: List[Dict]) -> Dict:
              "elements": [
                  {
                      "type": "mrkdwn",
-                     "text": f"*Detected {len(regressions)} significant WER regression(s)*"
                  }
              ]
          },
@@ -237,84 +588,239 @@ def generate_slack_message(regressions: List[Dict]) -> Dict:
          ]
  
      # Group regressions by type
-     device_regressions = [r for r in regressions if r["type"] == "device_discrepancy"]
-     os_regressions = [r for r in regressions if r["type"] == "os_discrepancy"]
-     release_regressions = [r for r in regressions if r["type"] == "release_regression"]
  
-     if device_regressions:
          blocks.append({
              "type": "section",
              "text": {
                  "type": "mrkdwn",
-                 "text": "*Device Discrepancies:*"
              }
          })
  
-         for regression in device_regressions:
              blocks.append({
                  "type": "section",
                  "text": {
                      "type": "mrkdwn",
-                     "text": f"*{regression['model']}* on {regression['os']}\n"
-                             f"• {regression['device_1']}: {regression['wer_1']}% WER\n"
-                             f"• {regression['device_2']}: {regression['wer_2']}% WER\n"
-                             f"• Difference: {regression['percentage_diff']}%"
                  }
              })
  
-     if os_regressions:
-         if device_regressions:
              blocks.append({"type": "divider"})
  
          blocks.append({
              "type": "section",
              "text": {
                  "type": "mrkdwn",
-                 "text": "*OS Version Discrepancies:*"
              }
          })
  
-         for regression in os_regressions:
              blocks.append({
                  "type": "section",
                  "text": {
                      "type": "mrkdwn",
-                     "text": f"*{regression['model']}* on {regression['device']}\n"
-                             f"• {regression['os_1']}: {regression['wer_1']}% WER\n"
-                             f"• {regression['os_2']}: {regression['wer_2']}% WER\n"
-                             f"• Difference: {regression['percentage_diff']}%"
                  }
              })
  
-     if release_regressions:
-         if device_regressions or os_regressions:
              blocks.append({"type": "divider"})
  
          blocks.append({
              "type": "section",
              "text": {
                  "type": "mrkdwn",
-                 "text": "*Release-to-Release Regressions:*"
              }
          })
  
-         for regression in release_regressions:
              blocks.append({
                  "type": "section",
                  "text": {
                      "type": "mrkdwn",
                      "text": f"*{regression['model']}* on {regression['device']} ({regression['os']})\n"
-                             f"• Previous: {regression['previous_wer']}% WER\n"
-                             f"• Current: {regression['current_wer']}% WER\n"
                              f"• Increase: +{regression['percentage_increase']}%"
                  }
              })
  
      return {"blocks": blocks}
  
  
- def check_wer_regressions():
-     """Main function to check for WER regressions and generate alerts."""
  
      # Load version data to get commit hashes
      try:
@@ -333,7 +839,7 @@ def check_wer_regressions():
      current_commit = releases[-1] if releases else None
      previous_commit = releases[-2] if len(releases) >= 2 else None
  
-     print(f"Checking WER regressions for current commit: {current_commit}")
      if previous_commit:
          print(f"Comparing against previous commit: {previous_commit}")
  
@@ -347,43 +853,70 @@
      all_regressions = []
  
-     # Check for device discrepancies across all WhisperKit versions
-     device_regressions = detect_device_regressions(all_historical_data, threshold=20.0)
      all_regressions.extend(device_regressions)
-     print(f"Found {len(device_regressions)} device discrepancies across WhisperKit versions")
  
-     # Check for OS discrepancies across all WhisperKit versions
-     os_regressions = detect_os_regressions(all_historical_data, threshold=20.0)
      all_regressions.extend(os_regressions)
-     print(f"Found {len(os_regressions)} OS discrepancies across WhisperKit versions")
  
-     # Check for release-to-release regressions
      release_regressions = detect_release_regressions(current_data, previous_data, threshold=20.0)
      all_regressions.extend(release_regressions)
-     print(f"Found {len(release_regressions)} release regressions")
  
      # Generate outputs
      github_output = os.getenv("GITHUB_OUTPUT")
      if github_output:
          with open(github_output, "a") as f:
-             print(f"has_wer_regressions={'true' if all_regressions else 'false'}", file=f)
-             print(f"wer_regression_count={len(all_regressions)}", file=f)
  
              if all_regressions:
                  slack_payload = generate_slack_message(all_regressions)
                  if slack_payload:
-                     f.write("wer_regression_slack_payload<<EOF\n")
                      json.dump(slack_payload, f, indent=2)
                      f.write("\nEOF\n")
  
      # Print summary for debugging
      if all_regressions:
-         print(f"\nALERT: Found {len(all_regressions)} WER regressions!")
          for regression in all_regressions:
              print(f" - {regression['type']}: {regression.get('model', 'N/A')}")
      else:
-         print("No significant WER regressions detected")
  
  
  if __name__ == "__main__":
-     check_wer_regressions()
  #!/usr/bin/env python3
  """
+ WhisperKit Performance Regression Detection Script
+
+ This script detects significant performance regressions per model by:
+ - Tracking the best (lowest) WER for each model
+ - Tracking the best (highest) speed and tokens per second for each model
+ - Comparing all configurations against those best baselines
+ - Alerting if any configuration deviates by > 20%
+
+ If any model shows a discrepancy > 20%, it alerts via Slack.
  """
  
  import json
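
Note: concretely, the strategy in the docstring above keeps one running best value per key and flags any configuration more than the threshold away from it. A small worked example (numbers illustrative):

best_wer = 10.0     # best (lowest) historical WER for some model+device key
current_wer = 12.5  # WER observed for that key in the current release
pct_diff = (current_wer - best_wer) / best_wer * 100  # -> 25.0
assert pct_diff > 20.0  # 25% above the best baseline, so a regression is flagged
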
      }
  
  
+ def detect_device_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
      """
+     Detect WER regressions for devices in current release.
+     Compares current data points against historical best for each model+device combination.
      Returns list of regression alerts.
      """
      regressions = []
  
+     # Build historical best WER for each model+device combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         key = (entry["model"], entry["device"])
+         if key not in historical_best:
+             historical_best[key] = entry["average_wer"]
+             best_configs[key] = entry
+         elif entry["average_wer"] < historical_best[key]:
+             historical_best[key] = entry["average_wer"]
+             best_configs[key] = entry
+
+     # Check each current data point against historical best
+     for entry in current_data:
+         key = (entry["model"], entry["device"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_wer = historical_best[key]
+         best_config = best_configs[key]
+         current_wer = entry["average_wer"]
+
+         if best_wer > 0:  # Avoid division by zero
+             pct_diff = (current_wer - best_wer) / best_wer * 100
+
+             # Only flag if current is significantly worse than historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "device_wer_discrepancy",
+                     "metric": "WER",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_wer, 2),
+                     "best_value": round(best_wer, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
  
+     return regressions
+
+
+ def detect_os_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect WER regressions for OS versions in current release.
+     Compares current data points against historical best for each model+OS combination.
+     Returns list of regression alerts.
+     """
+     regressions = []
+
+     # Build historical best WER for each model+OS combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         key = (entry["model"], entry["os"])
+         if key not in historical_best:
+             historical_best[key] = entry["average_wer"]
+             best_configs[key] = entry
+         elif entry["average_wer"] < historical_best[key]:
+             historical_best[key] = entry["average_wer"]
+             best_configs[key] = entry
+
+     # Check each current data point against historical best
+     for entry in current_data:
+         key = (entry["model"], entry["os"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_wer = historical_best[key]
+         best_config = best_configs[key]
+         current_wer = entry["average_wer"]
+
+         if best_wer > 0:  # Avoid division by zero
+             pct_diff = (current_wer - best_wer) / best_wer * 100
+
+             # Only flag if current is significantly worse than historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "os_wer_discrepancy",
+                     "metric": "WER",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_wer, 2),
+                     "best_value": round(best_wer, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
+
+     return regressions
+
+
+ def detect_release_regressions(current_data: List[Dict], previous_data: List[Dict],
+                                threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect WER regressions in current release for each model.
+     Compares current WER against the best (lowest) historical WER for that model.
+     Returns list of regression alerts.
+     """
+     regressions = []
+
+     if not previous_data:
+         print("No previous release data available for comparison")
+         return regressions
+
+     # Combine all historical data
+     all_historical = previous_data
+
+     # Group by model
+     model_current = defaultdict(list)
+     model_historical = defaultdict(list)
+
+     for entry in current_data:
+         model_current[entry["model"]].append(entry)
+
+     for entry in all_historical:
+         model_historical[entry["model"]].append(entry)
+
+     # Check each model
+     for model in model_current.keys():
+         if model not in model_historical:
+             continue  # No historical data for this model
+
+         # Find best historical WER for this model
+         best_historical_wer = min(entry["average_wer"] for entry in model_historical[model])
+         best_config = next(e for e in model_historical[model] if e["average_wer"] == best_historical_wer)
+
+         # Check each current configuration against best historical
+         for current_entry in model_current[model]:
+             current_wer = current_entry["average_wer"]
  
+             if best_historical_wer > 0:  # Avoid division by zero
+                 pct_change = (current_wer - best_historical_wer) / best_historical_wer * 100
  
+                 # Only flag significant WER increases (regressions)
+                 if pct_change > threshold:
+                     regressions.append({
+                         "type": "release_wer_regression",
+                         "metric": "WER",
+                         "model": model,
+                         "device": current_entry["device"],
+                         "os": current_entry["os"],
+                         "current_value": round(current_wer, 2),
+                         "best_historical_value": round(best_historical_wer, 2),
+                         "best_device": best_config["device"],
+                         "best_os": best_config["os"],
+                         "percentage_increase": round(pct_change, 1)
+                     })
  
      return regressions
  
  
+ def detect_speed_device_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
      """
+     Detect speed regressions for devices in current release.
+     Compares current data points against historical best for each model+device combination.
      Returns list of regression alerts.
      """
      regressions = []
  
+     # Build historical best speed for each model+device combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         if "speed" not in entry:
+             continue
+         key = (entry["model"], entry["device"])
+         if key not in historical_best:
+             historical_best[key] = entry["speed"]
+             best_configs[key] = entry
+         elif entry["speed"] > historical_best[key]:
+             historical_best[key] = entry["speed"]
+             best_configs[key] = entry
+
+     # Check each current data point against historical best
+     for entry in current_data:
+         if "speed" not in entry:
+             continue
+
+         key = (entry["model"], entry["device"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_speed = historical_best[key]
+         best_config = best_configs[key]
+         current_speed = entry["speed"]
+
+         if best_speed > 0:  # Avoid division by zero
+             pct_diff = (best_speed - current_speed) / best_speed * 100
+
+             # Only flag if current is significantly slower than historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "device_speed_discrepancy",
+                     "metric": "Speed",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_speed, 2),
+                     "best_value": round(best_speed, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
+
+     return regressions
+
+
+ def detect_speed_os_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect speed regressions for OS versions in current release.
+     Compares current data points against historical best for each model+OS combination.
+     Returns list of regression alerts.
+     """
+     regressions = []
  
+     # Build historical best speed for each model+OS combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         if "speed" not in entry:
+             continue
+         key = (entry["model"], entry["os"])
+         if key not in historical_best:
+             historical_best[key] = entry["speed"]
+             best_configs[key] = entry
+         elif entry["speed"] > historical_best[key]:
+             historical_best[key] = entry["speed"]
+             best_configs[key] = entry
+
+     # Check each current data point against historical best
+     for entry in current_data:
+         if "speed" not in entry:
              continue
  
+         key = (entry["model"], entry["os"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_speed = historical_best[key]
+         best_config = best_configs[key]
+         current_speed = entry["speed"]
+
+         if best_speed > 0:  # Avoid division by zero
+             pct_diff = (best_speed - current_speed) / best_speed * 100
+
+             # Only flag if current is significantly slower than historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "os_speed_discrepancy",
+                     "metric": "Speed",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_speed, 2),
+                     "best_value": round(best_speed, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
  
      return regressions
  
  
+ def detect_speed_release_regressions(current_data: List[Dict], previous_data: List[Dict],
+                                      threshold: float = 20.0) -> List[Dict]:
      """
+     Detect speed regressions in current release for each model.
+     Compares current speed against the best (highest) historical speed for that model.
      Returns list of regression alerts.
      """
      regressions = []
  
      if not previous_data:
          return regressions
  
+     # Group by model
+     model_current = defaultdict(list)
+     model_historical = defaultdict(list)
  
      for entry in current_data:
+         if "speed" in entry:
+             model_current[entry["model"]].append(entry)
  
      for entry in previous_data:
+         if "speed" in entry:
+             model_historical[entry["model"]].append(entry)
+
+     # Check each model
+     for model in model_current.keys():
+         if model not in model_historical:
+             continue  # No historical data for this model
+
+         # Find best historical speed for this model
+         best_historical_speed = max(entry["speed"] for entry in model_historical[model])
+         best_config = next(e for e in model_historical[model] if e["speed"] == best_historical_speed)
+
+         # Check each current configuration against best historical
+         for current_entry in model_current[model]:
+             current_speed = current_entry["speed"]
+
+             if best_historical_speed > 0:  # Avoid division by zero
+                 pct_change = (best_historical_speed - current_speed) / best_historical_speed * 100
+
+                 # Only flag significant speed decreases (regressions)
+                 if pct_change > threshold:
+                     regressions.append({
+                         "type": "release_speed_regression",
+                         "metric": "Speed",
+                         "model": model,
+                         "device": current_entry["device"],
+                         "os": current_entry["os"],
+                         "current_value": round(current_speed, 2),
+                         "best_historical_value": round(best_historical_speed, 2),
+                         "best_device": best_config["device"],
+                         "best_os": best_config["os"],
+                         "percentage_decrease": round(pct_change, 1)
+                     })
+
+     return regressions
+
+
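
Note the sign-convention flip between the WER and speed detectors above: WER regressions use (current - best) / best because lower is better, while speed and tokens/sec use (best - current) / best because higher is better. A two-line check of both directions (values illustrative):

assert (13.0 - 10.0) / 10.0 * 100 == 30.0    # WER: 30% above best -> worse
assert (100.0 - 60.0) / 100.0 * 100 == 40.0  # speed: 40% below best -> worse
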
391
+ def detect_tokens_device_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
392
+ """
393
+ Detect tokens per second regressions for devices in current release.
394
+ Compares current data points against historical best for each model+device combination.
395
+ Returns list of regression alerts.
396
+ """
397
+ regressions = []
398
+
399
+ # Build historical best tokens/sec for each model+device combination
400
+ historical_best = {}
401
+ best_configs = {}
402
+ for entry in all_historical_data:
403
+ if "tokens_per_second" not in entry:
404
+ continue
405
+ key = (entry["model"], entry["device"])
406
+ if key not in historical_best:
407
+ historical_best[key] = entry["tokens_per_second"]
408
+ best_configs[key] = entry
409
+ elif entry["tokens_per_second"] > historical_best[key]:
410
+ historical_best[key] = entry["tokens_per_second"]
411
+ best_configs[key] = entry
412
+
413
+ # Check each current data point against historical best
414
+ for entry in current_data:
415
+ if "tokens_per_second" not in entry:
416
+ continue
417
+
418
+ key = (entry["model"], entry["device"])
419
+
420
+ if key not in historical_best:
421
+ continue # No historical data for this combination
422
+
423
+ best_tokens = historical_best[key]
424
+ best_config = best_configs[key]
425
+ current_tokens = entry["tokens_per_second"]
426
+
427
+ if best_tokens > 0: # Avoid division by zero
428
+ pct_diff = (best_tokens - current_tokens) / best_tokens * 100
429
+
430
+ # Only flag if current is significantly slower than historical best
431
+ if pct_diff > threshold:
432
+ regressions.append({
433
+ "type": "device_tokens_discrepancy",
434
+ "metric": "Tokens/Second",
435
+ "model": entry["model"],
436
+ "device": entry["device"],
437
+ "os": entry["os"],
438
+ "current_value": round(current_tokens, 2),
439
+ "best_value": round(best_tokens, 2),
440
+ "best_device": best_config["device"],
441
+ "best_os": best_config["os"],
442
+ "percentage_diff": round(pct_diff, 1)
443
+ })
444
 
445
+ return regressions
446
+
447
+
448
+ def detect_tokens_os_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
449
+ """
450
+ Detect tokens per second regressions for OS versions in current release.
451
+ Compares current data points against historical best for each model+OS combination.
452
+ Returns list of regression alerts.
453
+ """
454
+ regressions = []
455
 
456
+ # Build historical best tokens/sec for each model+OS combination
457
+ historical_best = {}
458
+ best_configs = {}
459
+ for entry in all_historical_data:
460
+ if "tokens_per_second" not in entry:
461
+ continue
462
+ key = (entry["model"], entry["os"])
463
+ if key not in historical_best:
464
+ historical_best[key] = entry["tokens_per_second"]
465
+ best_configs[key] = entry
466
+ elif entry["tokens_per_second"] > historical_best[key]:
467
+ historical_best[key] = entry["tokens_per_second"]
468
+ best_configs[key] = entry
469
+
470
+ # Check each current data point against historical best
471
+ for entry in current_data:
472
+ if "tokens_per_second" not in entry:
473
+ continue
474
+
475
+ key = (entry["model"], entry["os"])
476
+
477
+ if key not in historical_best:
478
+ continue # No historical data for this combination
479
 
480
+ best_tokens = historical_best[key]
481
+ best_config = best_configs[key]
482
+ current_tokens = entry["tokens_per_second"]
483
+
484
+ if best_tokens > 0: # Avoid division by zero
485
+ pct_diff = (best_tokens - current_tokens) / best_tokens * 100
486
 
487
+ # Only flag if current is significantly slower than historical best
488
+ if pct_diff > threshold:
489
  regressions.append({
490
+ "type": "os_tokens_discrepancy",
491
+ "metric": "Tokens/Second",
492
+ "model": entry["model"],
493
+ "device": entry["device"],
494
+ "os": entry["os"],
495
+ "current_value": round(current_tokens, 2),
496
+ "best_value": round(best_tokens, 2),
497
+ "best_device": best_config["device"],
498
+ "best_os": best_config["os"],
499
+ "percentage_diff": round(pct_diff, 1)
500
+ })
501
+
502
+ return regressions
503
+
504
+
505
+ def detect_tokens_release_regressions(current_data: List[Dict], previous_data: List[Dict],
506
+ threshold: float = 20.0) -> List[Dict]:
507
+ """
508
+ Detect tokens per second regressions in current release for each model.
509
+ Compares current tokens/sec against the best (highest) historical tokens/sec for that model.
510
+ Returns list of regression alerts.
511
+ """
512
+ regressions = []
513
+
514
+ if not previous_data:
515
+ return regressions
516
+
517
+ # Group by model
518
+ model_current = defaultdict(list)
519
+ model_historical = defaultdict(list)
520
+
521
+ for entry in current_data:
522
+ if "tokens_per_second" in entry:
523
+ model_current[entry["model"]].append(entry)
524
+
525
+ for entry in previous_data:
526
+ if "tokens_per_second" in entry:
527
+ model_historical[entry["model"]].append(entry)
528
+
529
+ # Check each model
530
+ for model in model_current.keys():
531
+ if model not in model_historical:
532
+ continue # No historical data for this model
533
+
534
+ # Find best historical tokens/sec for this model
535
+ best_historical_tokens = max(entry["tokens_per_second"] for entry in model_historical[model])
536
+ best_config = next(e for e in model_historical[model] if e["tokens_per_second"] == best_historical_tokens)
537
+
538
+ # Check each current configuration against best historical
539
+ for current_entry in model_current[model]:
540
+ current_tokens = current_entry["tokens_per_second"]
541
+
542
+ if best_historical_tokens > 0: # Avoid division by zero
543
+ pct_change = (best_historical_tokens - current_tokens) / best_historical_tokens * 100
544
+
545
+ # Only flag significant tokens/sec decreases (regressions)
546
+ if pct_change > threshold:
547
+ regressions.append({
548
+ "type": "release_tokens_regression",
549
+ "metric": "Tokens/Second",
550
  "model": model,
551
+ "device": current_entry["device"],
552
+ "os": current_entry["os"],
553
+ "current_value": round(current_tokens, 2),
554
+ "best_historical_value": round(best_historical_tokens, 2),
555
+ "best_device": best_config["device"],
556
+ "best_os": best_config["os"],
557
+ "percentage_decrease": round(pct_change, 1)
558
  })
559
 
560
  return regressions
561
 
562
 
563
  def generate_slack_message(regressions: List[Dict]) -> Dict:
564
+ """Generate Slack message payload for performance regression alerts."""
565
 
566
  if not regressions:
567
  return None
 
571
  "type": "header",
572
  "text": {
573
  "type": "plain_text",
574
+ "text": "⚠️ WhisperKit Performance Regression Alert",
575
  "emoji": True
576
  }
577
  },
 
580
  "elements": [
581
  {
582
  "type": "mrkdwn",
583
+ "text": f"*Detected {len(regressions)} significant performance regression(s)*"
584
  }
585
  ]
586
  },
 
588
  ]
589
 
590
  # Group regressions by type
591
+ wer_device = [r for r in regressions if r["type"] == "device_wer_discrepancy"]
592
+ wer_os = [r for r in regressions if r["type"] == "os_wer_discrepancy"]
593
+ wer_release = [r for r in regressions if r["type"] == "release_wer_regression"]
594
+
595
+ speed_device = [r for r in regressions if r["type"] == "device_speed_discrepancy"]
596
+ speed_os = [r for r in regressions if r["type"] == "os_speed_discrepancy"]
597
+ speed_release = [r for r in regressions if r["type"] == "release_speed_regression"]
598
 
599
+ tokens_device = [r for r in regressions if r["type"] == "device_tokens_discrepancy"]
600
+ tokens_os = [r for r in regressions if r["type"] == "os_tokens_discrepancy"]
601
+ tokens_release = [r for r in regressions if r["type"] == "release_tokens_regression"]
602
+
+    # WER Regressions
+    if wer_device:
         blocks.append({
             "type": "section",
             "text": {
                 "type": "mrkdwn",
+                "text": "*WER Device Discrepancies:*"
             }
         })

+        for regression in wer_device:
             blocks.append({
                 "type": "section",
                 "text": {
                     "type": "mrkdwn",
+                    "text": f"*{regression['model']}*\n"
+                            f"• {regression['device']}: {regression['current_value']}% WER\n"
+                            f"• Best: {regression['best_value']}% WER ({regression['best_device']} on {regression['best_os']})\n"
+                            f"• Deviation: +{regression['percentage_diff']}%"
                 }
             })

+    if wer_os:
+        if wer_device:
             blocks.append({"type": "divider"})

         blocks.append({
             "type": "section",
             "text": {
                 "type": "mrkdwn",
+                "text": "*WER OS Version Discrepancies:*"
             }
         })

+        for regression in wer_os:
             blocks.append({
                 "type": "section",
                 "text": {
                     "type": "mrkdwn",
+                    "text": f"*{regression['model']}*\n"
+                            f"• {regression['os']}: {regression['current_value']}% WER\n"
+                            f"• Best: {regression['best_value']}% WER ({regression['best_device']} on {regression['best_os']})\n"
+                            f"• Deviation: +{regression['percentage_diff']}%"
                 }
             })

+    if wer_release:
+        if wer_device or wer_os:
             blocks.append({"type": "divider"})

         blocks.append({
             "type": "section",
             "text": {
                 "type": "mrkdwn",
+                "text": "*WER Release-to-Release Regressions:*"
             }
         })

+        for regression in wer_release:
             blocks.append({
                 "type": "section",
                 "text": {
                     "type": "mrkdwn",
                     "text": f"*{regression['model']}* on {regression['device']} ({regression['os']})\n"
+                            f"• Current: {regression['current_value']}% WER\n"
+                            f"• Best Historical: {regression['best_historical_value']}% WER ({regression['best_device']} on {regression['best_os']})\n"
                             f"• Increase: +{regression['percentage_increase']}%"
                 }
             })

+    # Speed Regressions
+    if speed_device:
+        if wer_device or wer_os or wer_release:
+            blocks.append({"type": "divider"})
+
+        blocks.append({
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": "*Speed Device Discrepancies:*"
+            }
+        })
+
+        for regression in speed_device:
+            blocks.append({
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": f"*{regression['model']}*\n"
+                            f"• {regression['device']}: {regression['current_value']}x speed\n"
+                            f"• Best: {regression['best_value']}x speed ({regression['best_device']} on {regression['best_os']})\n"
+                            f"• Slower by: {regression['percentage_diff']}%"
+                }
+            })
+
+    if speed_os:
+        if any([wer_device, wer_os, wer_release, speed_device]):
+            blocks.append({"type": "divider"})
+
+        blocks.append({
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": "*Speed OS Version Discrepancies:*"
+            }
+        })
+
+        for regression in speed_os:
+            blocks.append({
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": f"*{regression['model']}*\n"
+                            f"• {regression['os']}: {regression['current_value']}x speed\n"
+                            f"• Best: {regression['best_value']}x speed ({regression['best_device']} on {regression['best_os']})\n"
+                            f"• Slower by: {regression['percentage_diff']}%"
+                }
+            })
+
+    if speed_release:
+        if any([wer_device, wer_os, wer_release, speed_device, speed_os]):
+            blocks.append({"type": "divider"})
+
+        blocks.append({
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": "*Speed Release-to-Release Regressions:*"
+            }
+        })
+
+        for regression in speed_release:
+            blocks.append({
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": f"*{regression['model']}* on {regression['device']} ({regression['os']})\n"
+                            f"• Current: {regression['current_value']}x speed\n"
+                            f"• Best Historical: {regression['best_historical_value']}x speed ({regression['best_device']} on {regression['best_os']})\n"
+                            f"• Slower by: {regression.get('percentage_decrease', regression.get('percentage_increase', 0))}%"
+                }
+            })
+
+    # Tokens Per Second Regressions
+    if tokens_device:
+        if any([wer_device, wer_os, wer_release, speed_device, speed_os, speed_release]):
+            blocks.append({"type": "divider"})
+
+        blocks.append({
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": "*Tokens/Second Device Discrepancies:*"
+            }
+        })
+
+        for regression in tokens_device:
+            blocks.append({
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": f"*{regression['model']}*\n"
+                            f"• {regression['device']}: {regression['current_value']} tokens/sec\n"
+                            f"• Best: {regression['best_value']} tokens/sec ({regression['best_device']} on {regression['best_os']})\n"
+                            f"• Slower by: {regression['percentage_diff']}%"
+                }
+            })
+
+    if tokens_os:
+        if any([wer_device, wer_os, wer_release, speed_device, speed_os, speed_release, tokens_device]):
+            blocks.append({"type": "divider"})
+
+        blocks.append({
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": "*Tokens/Second OS Version Discrepancies:*"
+            }
+        })
+
+        for regression in tokens_os:
+            blocks.append({
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": f"*{regression['model']}*\n"
+                            f"• {regression['os']}: {regression['current_value']} tokens/sec\n"
+                            f"• Best: {regression['best_value']} tokens/sec ({regression['best_device']} on {regression['best_os']})\n"
+                            f"• Slower by: {regression['percentage_diff']}%"
+                }
+            })
+
+    if tokens_release:
+        if any([wer_device, wer_os, wer_release, speed_device, speed_os, speed_release, tokens_device, tokens_os]):
+            blocks.append({"type": "divider"})
+
+        blocks.append({
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": "*Tokens/Second Release-to-Release Regressions:*"
+            }
+        })
+
+        for regression in tokens_release:
+            blocks.append({
+                "type": "section",
+                "text": {
+                    "type": "mrkdwn",
+                    "text": f"*{regression['model']}* on {regression['device']} ({regression['os']})\n"
+                            f"• Current: {regression['current_value']} tokens/sec\n"
+                            f"• Best Historical: {regression['best_historical_value']} tokens/sec ({regression['best_device']} on {regression['best_os']})\n"
+                            f"• Slower by: {regression.get('percentage_decrease', regression.get('percentage_increase', 0))}%"
+                }
+            })
+
     return {"blocks": blocks}
 
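The dict returned above is a Slack Block Kit payload. As a rough sketch of how such a payload could be delivered with only the standard library; note that `post_to_slack` and the `SLACK_WEBHOOK_URL` variable are illustrative assumptions, not part of this script or its workflow:

    import json
    import os
    import urllib.request

    def post_to_slack(payload: dict) -> None:
        """Illustrative delivery step; assumes an incoming-webhook URL in SLACK_WEBHOOK_URL."""
        webhook_url = os.environ["SLACK_WEBHOOK_URL"]  # hypothetical env var
        request = urllib.request.Request(
            webhook_url,
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request) as response:
            response.read()  # Slack's webhook endpoint replies with "ok" on success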

+def check_performance_regressions():
+    """Main function to check for performance regressions and generate alerts."""

     # Load version data to get commit hashes
     try:
 
     current_commit = releases[-1] if releases else None
     previous_commit = releases[-2] if len(releases) >= 2 else None

+    print(f"Checking performance regressions for current commit: {current_commit}")
     if previous_commit:
         print(f"Comparing against previous commit: {previous_commit}")
 

     all_regressions = []

+    # WER Checks
+    print("\n=== Checking WER Regressions ===")
+    device_regressions = detect_device_regressions(current_data, all_historical_data, threshold=20.0)
     all_regressions.extend(device_regressions)
+    print(f"Found {len(device_regressions)} WER device discrepancies")

+    os_regressions = detect_os_regressions(current_data, all_historical_data, threshold=20.0)
     all_regressions.extend(os_regressions)
+    print(f"Found {len(os_regressions)} WER OS discrepancies")

     release_regressions = detect_release_regressions(current_data, previous_data, threshold=20.0)
     all_regressions.extend(release_regressions)
+    print(f"Found {len(release_regressions)} WER release regressions")
+
+    # Speed Checks
+    print("\n=== Checking Speed Regressions ===")
+    speed_device_regressions = detect_speed_device_regressions(current_data, all_historical_data, threshold=20.0)
+    all_regressions.extend(speed_device_regressions)
+    print(f"Found {len(speed_device_regressions)} speed device discrepancies")
+
+    speed_os_regressions = detect_speed_os_regressions(current_data, all_historical_data, threshold=20.0)
+    all_regressions.extend(speed_os_regressions)
+    print(f"Found {len(speed_os_regressions)} speed OS discrepancies")
+
+    speed_release_regressions = detect_speed_release_regressions(current_data, previous_data, threshold=20.0)
+    all_regressions.extend(speed_release_regressions)
+    print(f"Found {len(speed_release_regressions)} speed release regressions")
+
+    # Tokens Per Second Checks
+    print("\n=== Checking Tokens/Second Regressions ===")
+    tokens_device_regressions = detect_tokens_device_regressions(current_data, all_historical_data, threshold=20.0)
+    all_regressions.extend(tokens_device_regressions)
+    print(f"Found {len(tokens_device_regressions)} tokens/sec device discrepancies")
+
+    tokens_os_regressions = detect_tokens_os_regressions(current_data, all_historical_data, threshold=20.0)
+    all_regressions.extend(tokens_os_regressions)
+    print(f"Found {len(tokens_os_regressions)} tokens/sec OS discrepancies")
+
+    tokens_release_regressions = detect_tokens_release_regressions(current_data, previous_data, threshold=20.0)
+    all_regressions.extend(tokens_release_regressions)
+    print(f"Found {len(tokens_release_regressions)} tokens/sec release regressions")

     # Generate outputs
     github_output = os.getenv("GITHUB_OUTPUT")
     if github_output:
         with open(github_output, "a") as f:
+            print(f"has_performance_regressions={'true' if all_regressions else 'false'}", file=f)
+            print(f"performance_regression_count={len(all_regressions)}", file=f)

             if all_regressions:
                 slack_payload = generate_slack_message(all_regressions)
                 if slack_payload:
+                    f.write("performance_regression_slack_payload<<EOF\n")
                     json.dump(slack_payload, f, indent=2)
                     f.write("\nEOF\n")

     # Print summary for debugging
     if all_regressions:
+        print(f"\n⚠️ ALERT: Found {len(all_regressions)} performance regressions!")
         for regression in all_regressions:
             print(f" - {regression['type']}: {regression.get('model', 'N/A')}")
     else:
+        print("\n✅ No significant performance regressions detected")


 if __name__ == "__main__":
+    check_performance_regressions()
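For reference, each entry the detectors append is a flat dict; a release_tokens_regression, for example, has the shape below. Field names come from the detector code above; the values here are purely illustrative:

    example_regression = {
        "type": "release_tokens_regression",
        "metric": "Tokens/Second",
        "model": "model-a",      # illustrative value
        "device": "iPad Pro",    # illustrative value
        "os": "iOS 18",          # illustrative value
        "current_value": 38.0,
        "best_historical_value": 50.0,
        "best_device": "iPhone 15",
        "best_os": "iOS 18",
        "percentage_decrease": 24.0,
    }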