#!/usr/bin/env python3 """ Test script for WER regression detection Tests all regression detection functions with synthetic and real data """ import json import sys from wer_regression_check import ( detect_device_regressions, detect_os_regressions, detect_release_regressions, detect_speed_device_regressions, detect_speed_os_regressions, detect_speed_release_regressions, detect_tokens_device_regressions, detect_tokens_os_regressions, detect_tokens_release_regressions, generate_slack_message, load_performance_data ) def test_wer_detection_with_synthetic_data(): """Test WER detection with known synthetic data""" print("\n" + "="*80) print("TEST 1: WER Detection with Synthetic Data") print("="*80) # Create synthetic data where we know there should be regressions # Historical data (best performances) historical_data = [ # Model A: iPhone has best WER of 10% {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0}, {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0}, {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.2, "speed": 9.8, "tokens_per_second": 49.0}, # Model B: iOS 17 has best WER of 10% {"model": "model-b", "device": "iPhone 15", "os": "iOS 17", "average_wer": 10.0, "speed": 20.0, "tokens_per_second": 100.0}, {"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 19.0, "tokens_per_second": 95.0}, # Model C: No regression scenario {"model": "model-c", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0}, {"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0}, ] # Current data (latest release with regressions) current_data = [ # Model A: iPad Pro has regressed to 15% WER (50% worse than best 10%) {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 15.0, "speed": 8.0, "tokens_per_second": 40.0}, {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.3, "speed": 9.7, "tokens_per_second": 49.5}, # Model B: iOS 18 has regressed to 13% WER (30% worse than best 10%) {"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 13.0, "speed": 15.0, "tokens_per_second": 75.0}, # Model C: Still within 20% (11% vs best 10%) {"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 9.0, "tokens_per_second": 45.0}, ] # Test device regressions device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0) print(f"\n✓ Device WER Regressions Found: {len(device_regressions)}") # Debug: print all found regressions for r in device_regressions: print(f" - {r['model']}: {r['device']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)") # Model A should trigger (iPad Pro is ~40% worse than iPhone) # Model C should NOT trigger (iPad Pro is only 10% worse) assert len(device_regressions) >= 1, f"Expected at least 1 device regression, got {len(device_regressions)}" # Verify model-a is in the regressions model_a_regressions = [r for r in device_regressions if r["model"] == "model-a"] assert len(model_a_regressions) > 0, "Expected model-a to have device regression" print(f"\n✓ Model-a correctly flagged for device regression") # Test OS regressions os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0) print(f"\n✓ OS WER Regressions Found: {len(os_regressions)}") # Debug: print all found OS regressions for r in os_regressions: print(f" - {r['model']}: {r['os']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)") assert len(os_regressions) >= 1, f"Expected at least 1 OS regression, got {len(os_regressions)}" # Verify model-b is in the regressions model_b_regressions = [r for r in os_regressions if r["model"] == "model-b"] assert len(model_b_regressions) > 0, "Expected model-b to have OS regression" print(f"\n✓ Model-b correctly flagged for OS regression") print("\n✅ TEST 1 PASSED: WER detection works correctly with synthetic data") return True def test_speed_detection_with_synthetic_data(): """Test speed detection with known synthetic data""" print("\n" + "="*80) print("TEST 2: Speed Detection with Synthetic Data") print("="*80) # Historical data (best performances) historical_data = [ # Model A: iPhone has best speed of 100 {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 200.0}, {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 190.0}, {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 98.0, "tokens_per_second": 195.0}, ] # Current data (with speed regression) current_data = [ # Model A: iPad Pro has regressed to 60 speed (40% slower than best 100) {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 60.0, "tokens_per_second": 120.0}, {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.2, "speed": 97.0, "tokens_per_second": 195.0}, ] # Test device speed regressions speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0) print(f"\n✓ Device Speed Regressions Found: {len(speed_device_regressions)}") assert len(speed_device_regressions) == 1, f"Expected 1 speed device regression, got {len(speed_device_regressions)}" print(f" - {speed_device_regressions[0]['model']}: {speed_device_regressions[0]['device']} has {speed_device_regressions[0]['current_value']}x speed vs best {speed_device_regressions[0]['best_value']}x") print("\n✅ TEST 2 PASSED: Speed detection works correctly with synthetic data") return True def test_tokens_detection_with_synthetic_data(): """Test tokens per second detection with known synthetic data""" print("\n" + "="*80) print("TEST 3: Tokens/Second Detection with Synthetic Data") print("="*80) # Historical data (best performances) historical_data = [ # Model A: iPhone has best tokens/sec of 500 {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0}, {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 98.0, "tokens_per_second": 490.0}, ] # Current data (with tokens/sec regression) current_data = [ # Model A: iPad Pro has regressed to 300 tokens/sec (40% slower than best 500) {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 80.0, "tokens_per_second": 300.0}, {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.1, "speed": 99.0, "tokens_per_second": 495.0}, ] # Test device tokens regressions tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0) print(f"\n✓ Device Tokens/Sec Regressions Found: {len(tokens_device_regressions)}") assert len(tokens_device_regressions) == 1, f"Expected 1 tokens device regression, got {len(tokens_device_regressions)}" print(f" - {tokens_device_regressions[0]['model']}: {tokens_device_regressions[0]['device']} has {tokens_device_regressions[0]['current_value']} tokens/sec vs best {tokens_device_regressions[0]['best_value']}") print("\n✅ TEST 3 PASSED: Tokens/sec detection works correctly with synthetic data") return True def test_release_regression_detection(): """Test release-to-release regression detection""" print("\n" + "="*80) print("TEST 4: Release-to-Release Regression Detection") print("="*80) # Previous release data (best performance) previous_data = [ {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0}, {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 490.0}, ] # Current release data (degraded performance - 50% worse) current_data = [ {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 15.0, "speed": 60.0, "tokens_per_second": 300.0}, ] # Test WER release regression wer_release_regressions = detect_release_regressions(current_data, previous_data, threshold=20.0) print(f"\n✓ WER Release Regressions Found: {len(wer_release_regressions)}") assert len(wer_release_regressions) == 1, f"Expected 1 WER release regression, got {len(wer_release_regressions)}" print(f" - {wer_release_regressions[0]['model']}: WER increased from {wer_release_regressions[0]['best_historical_value']}% to {wer_release_regressions[0]['current_value']}%") # Test speed release regression speed_release_regressions = detect_speed_release_regressions(current_data, previous_data, threshold=20.0) print(f"\n✓ Speed Release Regressions Found: {len(speed_release_regressions)}") assert len(speed_release_regressions) == 1, f"Expected 1 speed release regression, got {len(speed_release_regressions)}" print(f" - {speed_release_regressions[0]['model']}: Speed decreased from {speed_release_regressions[0]['best_historical_value']}x to {speed_release_regressions[0]['current_value']}x") # Test tokens release regression tokens_release_regressions = detect_tokens_release_regressions(current_data, previous_data, threshold=20.0) print(f"\n✓ Tokens/Sec Release Regressions Found: {len(tokens_release_regressions)}") assert len(tokens_release_regressions) == 1, f"Expected 1 tokens release regression, got {len(tokens_release_regressions)}" print(f" - {tokens_release_regressions[0]['model']}: Tokens/sec decreased from {tokens_release_regressions[0]['best_historical_value']} to {tokens_release_regressions[0]['current_value']}") print("\n✅ TEST 4 PASSED: Release-to-release regression detection works correctly") return True def test_slack_message_generation(): """Test Slack message generation""" print("\n" + "="*80) print("TEST 5: Slack Message Generation") print("="*80) # Create sample regressions sample_regressions = [ { "type": "device_wer_discrepancy", "metric": "WER", "model": "test-model", "device": "iPad Pro", "current_value": 35.0, "best_value": 25.0, "best_device": "iPhone 15", "best_os": "iOS 18", "percentage_diff": 40.0 }, { "type": "device_speed_discrepancy", "metric": "Speed", "model": "test-model", "device": "iPad Pro", "current_value": 60.0, "best_value": 100.0, "best_device": "iPhone 15", "best_os": "iOS 18", "percentage_diff": 40.0 } ] # Generate Slack message slack_payload = generate_slack_message(sample_regressions) assert slack_payload is not None, "Expected Slack payload to be generated" assert "blocks" in slack_payload, "Expected 'blocks' in Slack payload" assert len(slack_payload["blocks"]) > 0, "Expected at least one block in Slack payload" print(f"\n✓ Slack Message Generated Successfully") print(f" - Total blocks: {len(slack_payload['blocks'])}") print(f"\n📧 Full Slack Message Payload:") print("=" * 80) print(json.dumps(slack_payload, indent=2)) print("=" * 80) print("\n✅ TEST 5 PASSED: Slack message generation works correctly") return True def test_edge_cases(): """Test edge cases""" print("\n" + "="*80) print("TEST 6: Edge Cases") print("="*80) # Test with single data point (should not trigger any regressions - no historical comparison) single_current = [ {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0}, ] empty_historical = [] device_regressions = detect_device_regressions(single_current, empty_historical, threshold=20.0) assert len(device_regressions) == 0, f"Expected 0 regressions with no historical data, got {len(device_regressions)}" print("✓ Single data point with no historical data handled correctly (no regressions)") # Test with empty current data empty_regressions = detect_device_regressions([], single_current, threshold=20.0) assert len(empty_regressions) == 0, "Expected 0 regressions with empty current data" print("✓ Empty current data handled correctly") # Test with missing fields (tokens_per_second missing) partial_historical = [ {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0}, {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0}, ] partial_current = [ {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 30.0, "speed": 80.0}, ] # Should still work for WER and speed device_regressions = detect_device_regressions(partial_current, partial_historical, threshold=20.0) print(f"✓ Partial data (missing tokens) handled correctly: {len(device_regressions)} WER regressions found") # Should not crash for tokens tokens_regressions = detect_tokens_device_regressions(partial_current, partial_historical, threshold=20.0) assert len(tokens_regressions) == 0, "Expected 0 tokens regressions when field is missing" print("✓ Missing tokens_per_second field handled gracefully") print("\n✅ TEST 6 PASSED: Edge cases handled correctly") return True def test_with_real_data_sample(): """Test with a small sample of real data to verify calculations""" print("\n" + "="*80) print("TEST 7: Real Data Sample Verification") print("="*80) try: # Load a sample of real data real_data = load_performance_data("dashboard_data/performance_data.json") if len(real_data) == 0: print("⚠️ No real data found, skipping this test") return True print(f"✓ Loaded {len(real_data)} real data points") # Get unique models models = set(entry["model"] for entry in real_data) print(f"✓ Found {len(models)} unique models") # Split into current (last 10%) and historical (all data) for testing split_point = int(len(real_data) * 0.9) historical_data = real_data[:split_point] if split_point > 0 else real_data current_data = real_data[split_point:] if split_point > 0 else real_data[:10] # Run detection on real data device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0) os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0) speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0) tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0) print(f"\n✓ Real Data Analysis:") print(f" - WER device regressions: {len(device_regressions)}") print(f" - WER OS regressions: {len(os_regressions)}") print(f" - Speed device regressions: {len(speed_device_regressions)}") print(f" - Tokens device regressions: {len(tokens_device_regressions)}") # Show a few examples if any found if device_regressions: print(f"\n Example WER regression:") r = device_regressions[0] print(f" Model: {r['model']}") print(f" Device: {r['device']} on {r['os']}") print(f" Current: {r['current_value']}% WER") print(f" Historical best: {r['best_value']}% WER") print(f" Deviation: +{r['percentage_diff']}%") if speed_device_regressions: print(f"\n Example Speed regression:") r = speed_device_regressions[0] print(f" Model: {r['model']}") print(f" Device: {r['device']} on {r['os']}") print(f" Current: {r['current_value']}x speed") print(f" Historical best: {r['best_value']}x speed") print(f" Slower by: {r['percentage_diff']}%") print("\n✅ TEST 7 PASSED: Real data processed successfully") return True except FileNotFoundError: print("⚠️ dashboard_data/performance_data.json not found, skipping real data test") return True except Exception as e: print(f"❌ Error processing real data: {e}") return False def manual_verification_helper(): """Print data for manual verification""" print("\n" + "="*80) print("MANUAL VERIFICATION HELPER") print("="*80) try: real_data = load_performance_data("dashboard_data/performance_data.json") # Pick a model to analyze in detail models = {} for entry in real_data: model = entry["model"] if model not in models: models[model] = [] models[model].append(entry) # Find a model with multiple entries for model_name, entries in list(models.items())[:3]: # Check first 3 models if len(entries) >= 3: print(f"\n📊 Model: {model_name}") print(f" Total data points: {len(entries)}") # Show WER stats wer_values = [e["average_wer"] for e in entries] print(f"\n WER Analysis:") print(f" - Best (min): {min(wer_values):.2f}%") print(f" - Worst (max): {max(wer_values):.2f}%") print(f" - Difference: {((max(wer_values) - min(wer_values)) / min(wer_values) * 100):.1f}%") # Show by device devices = {} for entry in entries: device = entry["device"] if device not in devices: devices[device] = [] devices[device].append(entry["average_wer"]) print(f"\n WER by Device:") for device, wers in devices.items(): avg_wer = sum(wers) / len(wers) num_samples = len(wers) print(f" - {device}: {avg_wer:.2f}% avg ({num_samples} test runs)") # Show speed stats if available if "speed" in entries[0]: speed_values = [e["speed"] for e in entries] print(f"\n Speed Analysis:") print(f" - Best (max): {max(speed_values):.2f}x") print(f" - Worst (min): {min(speed_values):.2f}x") print(f" - Difference: {((max(speed_values) - min(speed_values)) / max(speed_values) * 100):.1f}%") break print("\n" + "="*80) print("Use the above data to manually verify regression detection logic") print("="*80) except Exception as e: print(f"Could not load data for manual verification: {e}") def run_all_tests(): """Run all tests""" print("\n" + "="*80) print("🧪 RUNNING ALL REGRESSION DETECTION TESTS") print("="*80) tests = [ ("WER Detection (Synthetic)", test_wer_detection_with_synthetic_data), ("Speed Detection (Synthetic)", test_speed_detection_with_synthetic_data), ("Tokens Detection (Synthetic)", test_tokens_detection_with_synthetic_data), ("Release Regression Detection", test_release_regression_detection), ("Slack Message Generation", test_slack_message_generation), ("Edge Cases", test_edge_cases), ("Real Data Sample", test_with_real_data_sample), ] passed = 0 failed = 0 for test_name, test_func in tests: try: if test_func(): passed += 1 else: failed += 1 print(f"\n❌ {test_name} FAILED") except AssertionError as e: failed += 1 print(f"\n❌ {test_name} FAILED: {e}") except Exception as e: failed += 1 print(f"\n❌ {test_name} ERROR: {e}") import traceback traceback.print_exc() # Print summary print("\n" + "="*80) print("TEST SUMMARY") print("="*80) print(f"✅ Passed: {passed}/{len(tests)}") print(f"❌ Failed: {failed}/{len(tests)}") if failed == 0: print("\n🎉 ALL TESTS PASSED! The implementation is working correctly.") print("\nNext steps:") print("1. Run manual verification helper to spot-check real data") print("2. Test in a non-production environment first") print("3. Monitor the first few runs carefully") else: print(f"\n⚠️ {failed} test(s) failed. Please review and fix issues.") return False return True if __name__ == "__main__": success = run_all_tests() # Optionally run manual verification helper print("\n" + "="*80) response = input("Run manual verification helper? (y/n): ") if response.lower() == 'y': manual_verification_helper() sys.exit(0 if success else 1)