#!/usr/bin/env python3
"""
Test script for WER regression detection
Tests all regression detection functions with synthetic and real data
"""
import json
import sys

from wer_regression_check import (
    detect_device_regressions,
    detect_os_regressions,
    detect_release_regressions,
    detect_speed_device_regressions,
    detect_speed_os_regressions,
    detect_speed_release_regressions,
    detect_tokens_device_regressions,
    detect_tokens_os_regressions,
    detect_tokens_release_regressions,
    generate_slack_message,
    load_performance_data,
)
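
# NOTE (assumption): the assertions below only rely on the detect_* helpers
# returning a list of regression-record dicts containing the fields the tests
# read, roughly:
#
#     {
#         "model": "model-a",
#         "device": "iPad Pro",
#         "os": "iOS 18",
#         "current_value": 15.0,      # value measured in the current data
#         "best_value": 10.0,         # best comparison value; the release-to-release
#                                     # checks expose it as "best_historical_value"
#         "percentage_diff": 50.0,    # deviation from the best value, in percent
#     }
#
# This shape is inferred from the assertions and sample payloads in this file,
# not from the wer_regression_check implementation itself.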
def test_wer_detection_with_synthetic_data():
    """Test WER detection with known synthetic data"""
    print("\n" + "=" * 80)
    print("TEST 1: WER Detection with Synthetic Data")
    print("=" * 80)

    # Create synthetic data where we know there should be regressions
    # Historical data (best performances)
    historical_data = [
        # Model A: iPhone has best WER of 10%
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0},
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0},
        {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.2, "speed": 9.8, "tokens_per_second": 49.0},
        # Model B: iOS 17 has best WER of 10%
        {"model": "model-b", "device": "iPhone 15", "os": "iOS 17", "average_wer": 10.0, "speed": 20.0, "tokens_per_second": 100.0},
        {"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 19.0, "tokens_per_second": 95.0},
        # Model C: No regression scenario
        {"model": "model-c", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0},
        {"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0},
    ]

    # Current data (latest release with regressions)
    current_data = [
        # Model A: iPad Pro has regressed to 15% WER (50% worse than best 10%)
        {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 15.0, "speed": 8.0, "tokens_per_second": 40.0},
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.3, "speed": 9.7, "tokens_per_second": 49.5},
        # Model B: iOS 18 has regressed to 13% WER (30% worse than best 10%)
        {"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 13.0, "speed": 15.0, "tokens_per_second": 75.0},
        # Model C: Still within 20% (11% vs best 10%)
        {"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 9.0, "tokens_per_second": 45.0},
    ]

    # Test device regressions
    device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0)
    print(f"\n✅ Device WER Regressions Found: {len(device_regressions)}")

    # Debug: print all found regressions
    for r in device_regressions:
        print(f"  - {r['model']}: {r['device']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)")

    # Model A should trigger (iPad Pro is 50% worse than the best 10% WER)
    # Model C should NOT trigger (iPad Pro is only 10% worse)
    assert len(device_regressions) >= 1, f"Expected at least 1 device regression, got {len(device_regressions)}"

    # Verify model-a is in the regressions
    model_a_regressions = [r for r in device_regressions if r["model"] == "model-a"]
    assert len(model_a_regressions) > 0, "Expected model-a to have device regression"
    print("\n✅ Model-a correctly flagged for device regression")

    # Test OS regressions
    os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0)
    print(f"\n✅ OS WER Regressions Found: {len(os_regressions)}")

    # Debug: print all found OS regressions
    for r in os_regressions:
        print(f"  - {r['model']}: {r['os']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)")

    assert len(os_regressions) >= 1, f"Expected at least 1 OS regression, got {len(os_regressions)}"

    # Verify model-b is in the regressions
    model_b_regressions = [r for r in os_regressions if r["model"] == "model-b"]
    assert len(model_b_regressions) > 0, "Expected model-b to have OS regression"
    print("\n✅ Model-b correctly flagged for OS regression")

    print("\n✅ TEST 1 PASSED: WER detection works correctly with synthetic data")
    return True
def test_speed_detection_with_synthetic_data():
    """Test speed detection with known synthetic data"""
    print("\n" + "=" * 80)
    print("TEST 2: Speed Detection with Synthetic Data")
    print("=" * 80)

    # Historical data (best performances)
    historical_data = [
        # Model A: iPhone has best speed of 100
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 200.0},
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 190.0},
        {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 98.0, "tokens_per_second": 195.0},
    ]

    # Current data (with speed regression)
    current_data = [
        # Model A: iPad Pro has regressed to 60 speed (40% slower than best 100)
        {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 60.0, "tokens_per_second": 120.0},
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.2, "speed": 97.0, "tokens_per_second": 195.0},
    ]

    # Test device speed regressions
    speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0)
    print(f"\n✅ Device Speed Regressions Found: {len(speed_device_regressions)}")
    assert len(speed_device_regressions) == 1, f"Expected 1 speed device regression, got {len(speed_device_regressions)}"
    print(f"  - {speed_device_regressions[0]['model']}: {speed_device_regressions[0]['device']} has {speed_device_regressions[0]['current_value']}x speed vs best {speed_device_regressions[0]['best_value']}x")

    print("\n✅ TEST 2 PASSED: Speed detection works correctly with synthetic data")
    return True
def test_tokens_detection_with_synthetic_data():
    """Test tokens per second detection with known synthetic data"""
    print("\n" + "=" * 80)
    print("TEST 3: Tokens/Second Detection with Synthetic Data")
    print("=" * 80)

    # Historical data (best performances)
    historical_data = [
        # Model A: iPhone has best tokens/sec of 500
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
        {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 98.0, "tokens_per_second": 490.0},
    ]

    # Current data (with tokens/sec regression)
    current_data = [
        # Model A: iPad Pro has regressed to 300 tokens/sec (40% slower than best 500)
        {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 80.0, "tokens_per_second": 300.0},
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.1, "speed": 99.0, "tokens_per_second": 495.0},
    ]

    # Test device tokens regressions
    tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0)
    print(f"\n✅ Device Tokens/Sec Regressions Found: {len(tokens_device_regressions)}")
    assert len(tokens_device_regressions) == 1, f"Expected 1 tokens device regression, got {len(tokens_device_regressions)}"
    print(f"  - {tokens_device_regressions[0]['model']}: {tokens_device_regressions[0]['device']} has {tokens_device_regressions[0]['current_value']} tokens/sec vs best {tokens_device_regressions[0]['best_value']}")

    print("\n✅ TEST 3 PASSED: Tokens/sec detection works correctly with synthetic data")
    return True
def test_release_regression_detection():
    """Test release-to-release regression detection"""
    print("\n" + "=" * 80)
    print("TEST 4: Release-to-Release Regression Detection")
    print("=" * 80)

    # Previous release data (best performance)
    previous_data = [
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 490.0},
    ]

    # Current release data (degraded: WER 50% worse, speed and tokens/sec 40% lower)
    current_data = [
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 15.0, "speed": 60.0, "tokens_per_second": 300.0},
    ]

    # Test WER release regression
    wer_release_regressions = detect_release_regressions(current_data, previous_data, threshold=20.0)
    print(f"\n✅ WER Release Regressions Found: {len(wer_release_regressions)}")
    assert len(wer_release_regressions) == 1, f"Expected 1 WER release regression, got {len(wer_release_regressions)}"
    print(f"  - {wer_release_regressions[0]['model']}: WER increased from {wer_release_regressions[0]['best_historical_value']}% to {wer_release_regressions[0]['current_value']}%")

    # Test speed release regression
    speed_release_regressions = detect_speed_release_regressions(current_data, previous_data, threshold=20.0)
    print(f"\n✅ Speed Release Regressions Found: {len(speed_release_regressions)}")
    assert len(speed_release_regressions) == 1, f"Expected 1 speed release regression, got {len(speed_release_regressions)}"
    print(f"  - {speed_release_regressions[0]['model']}: Speed decreased from {speed_release_regressions[0]['best_historical_value']}x to {speed_release_regressions[0]['current_value']}x")

    # Test tokens release regression
    tokens_release_regressions = detect_tokens_release_regressions(current_data, previous_data, threshold=20.0)
    print(f"\n✅ Tokens/Sec Release Regressions Found: {len(tokens_release_regressions)}")
    assert len(tokens_release_regressions) == 1, f"Expected 1 tokens release regression, got {len(tokens_release_regressions)}"
    print(f"  - {tokens_release_regressions[0]['model']}: Tokens/sec decreased from {tokens_release_regressions[0]['best_historical_value']} to {tokens_release_regressions[0]['current_value']}")

    print("\n✅ TEST 4 PASSED: Release-to-release regression detection works correctly")
    return True
def test_slack_message_generation():
    """Test Slack message generation"""
    print("\n" + "=" * 80)
    print("TEST 5: Slack Message Generation")
    print("=" * 80)

    # Create sample regressions
    sample_regressions = [
        {
            "type": "device_wer_discrepancy",
            "metric": "WER",
            "model": "test-model",
            "device": "iPad Pro",
            "current_value": 35.0,
            "best_value": 25.0,
            "best_device": "iPhone 15",
            "best_os": "iOS 18",
            "percentage_diff": 40.0
        },
        {
            "type": "device_speed_discrepancy",
            "metric": "Speed",
            "model": "test-model",
            "device": "iPad Pro",
            "current_value": 60.0,
            "best_value": 100.0,
            "best_device": "iPhone 15",
            "best_os": "iOS 18",
            "percentage_diff": 40.0
        }
    ]

    # Generate Slack message
    slack_payload = generate_slack_message(sample_regressions)

    assert slack_payload is not None, "Expected Slack payload to be generated"
    assert "blocks" in slack_payload, "Expected 'blocks' in Slack payload"
    assert len(slack_payload["blocks"]) > 0, "Expected at least one block in Slack payload"

    print("\n✅ Slack Message Generated Successfully")
    print(f"  - Total blocks: {len(slack_payload['blocks'])}")

    print("\n📧 Full Slack Message Payload:")
    print("=" * 80)
    print(json.dumps(slack_payload, indent=2))
    print("=" * 80)

    print("\n✅ TEST 5 PASSED: Slack message generation works correctly")
    return True
def test_edge_cases():
    """Test edge cases"""
    print("\n" + "=" * 80)
    print("TEST 6: Edge Cases")
    print("=" * 80)

    # Test with single data point (should not trigger any regressions - no historical comparison)
    single_current = [
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
    ]
    empty_historical = []

    device_regressions = detect_device_regressions(single_current, empty_historical, threshold=20.0)
    assert len(device_regressions) == 0, f"Expected 0 regressions with no historical data, got {len(device_regressions)}"
    print("✅ Single data point with no historical data handled correctly (no regressions)")

    # Test with empty current data
    empty_regressions = detect_device_regressions([], single_current, threshold=20.0)
    assert len(empty_regressions) == 0, "Expected 0 regressions with empty current data"
    print("✅ Empty current data handled correctly")

    # Test with missing fields (tokens_per_second missing)
    partial_historical = [
        {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0},
        {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0},
    ]
    partial_current = [
        {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 30.0, "speed": 80.0},
    ]

    # Should still work for WER and speed
    device_regressions = detect_device_regressions(partial_current, partial_historical, threshold=20.0)
    print(f"✅ Partial data (missing tokens) handled correctly: {len(device_regressions)} WER regressions found")

    # Should not crash for tokens
    tokens_regressions = detect_tokens_device_regressions(partial_current, partial_historical, threshold=20.0)
    assert len(tokens_regressions) == 0, "Expected 0 tokens regressions when field is missing"
    print("✅ Missing tokens_per_second field handled gracefully")

    print("\n✅ TEST 6 PASSED: Edge cases handled correctly")
    return True
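
# NOTE (assumption): TEST 7 and the manual verification helper expect
# dashboard_data/performance_data.json to load as a list of entries shaped like
# the synthetic records used above, e.g.:
#
#     {"model": "model-a", "device": "iPhone 15", "os": "iOS 18",
#      "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0}
#
# This is inferred from the fields accessed below, not from the dashboard's
# actual export format.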
def test_with_real_data_sample():
    """Test with a small sample of real data to verify calculations"""
    print("\n" + "=" * 80)
    print("TEST 7: Real Data Sample Verification")
    print("=" * 80)

    try:
        # Load a sample of real data
        real_data = load_performance_data("dashboard_data/performance_data.json")

        if len(real_data) == 0:
            print("⚠️ No real data found, skipping this test")
            return True

        print(f"✅ Loaded {len(real_data)} real data points")

        # Get unique models
        models = set(entry["model"] for entry in real_data)
        print(f"✅ Found {len(models)} unique models")

        # Split into historical (first 90%) and current (last 10%) for testing
        split_point = int(len(real_data) * 0.9)
        historical_data = real_data[:split_point] if split_point > 0 else real_data
        current_data = real_data[split_point:] if split_point > 0 else real_data[:10]

        # Run detection on real data
        device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0)
        os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0)
        speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0)
        tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0)

        print("\n✅ Real Data Analysis:")
        print(f"  - WER device regressions: {len(device_regressions)}")
        print(f"  - WER OS regressions: {len(os_regressions)}")
        print(f"  - Speed device regressions: {len(speed_device_regressions)}")
        print(f"  - Tokens device regressions: {len(tokens_device_regressions)}")

        # Show a few examples if any found
        if device_regressions:
            print("\n  Example WER regression:")
            r = device_regressions[0]
            print(f"    Model: {r['model']}")
            print(f"    Device: {r['device']} on {r['os']}")
            print(f"    Current: {r['current_value']}% WER")
            print(f"    Historical best: {r['best_value']}% WER")
            print(f"    Deviation: +{r['percentage_diff']}%")

        if speed_device_regressions:
            print("\n  Example Speed regression:")
            r = speed_device_regressions[0]
            print(f"    Model: {r['model']}")
            print(f"    Device: {r['device']} on {r['os']}")
            print(f"    Current: {r['current_value']}x speed")
            print(f"    Historical best: {r['best_value']}x speed")
            print(f"    Slower by: {r['percentage_diff']}%")

        print("\n✅ TEST 7 PASSED: Real data processed successfully")
        return True

    except FileNotFoundError:
        print("⚠️ dashboard_data/performance_data.json not found, skipping real data test")
        return True
    except Exception as e:
        print(f"❌ Error processing real data: {e}")
        return False
def manual_verification_helper():
    """Print data for manual verification"""
    print("\n" + "=" * 80)
    print("MANUAL VERIFICATION HELPER")
    print("=" * 80)

    try:
        real_data = load_performance_data("dashboard_data/performance_data.json")

        # Pick a model to analyze in detail
        models = {}
        for entry in real_data:
            model = entry["model"]
            if model not in models:
                models[model] = []
            models[model].append(entry)

        # Find a model with multiple entries
        for model_name, entries in list(models.items())[:3]:  # Check first 3 models
            if len(entries) >= 3:
                print(f"\n📊 Model: {model_name}")
                print(f"  Total data points: {len(entries)}")

                # Show WER stats
                wer_values = [e["average_wer"] for e in entries]
                print("\n  WER Analysis:")
                print(f"    - Best (min): {min(wer_values):.2f}%")
                print(f"    - Worst (max): {max(wer_values):.2f}%")
                print(f"    - Difference: {((max(wer_values) - min(wer_values)) / min(wer_values) * 100):.1f}%")

                # Show by device
                devices = {}
                for entry in entries:
                    device = entry["device"]
                    if device not in devices:
                        devices[device] = []
                    devices[device].append(entry["average_wer"])

                print("\n  WER by Device:")
                for device, wers in devices.items():
                    avg_wer = sum(wers) / len(wers)
                    num_samples = len(wers)
                    print(f"    - {device}: {avg_wer:.2f}% avg ({num_samples} test runs)")

                # Show speed stats if available
                if "speed" in entries[0]:
                    speed_values = [e["speed"] for e in entries]
                    print("\n  Speed Analysis:")
                    print(f"    - Best (max): {max(speed_values):.2f}x")
                    print(f"    - Worst (min): {min(speed_values):.2f}x")
                    print(f"    - Difference: {((max(speed_values) - min(speed_values)) / max(speed_values) * 100):.1f}%")

                break

        print("\n" + "=" * 80)
        print("Use the above data to manually verify regression detection logic")
        print("=" * 80)

    except Exception as e:
        print(f"Could not load data for manual verification: {e}")
def run_all_tests():
    """Run all tests"""
    print("\n" + "=" * 80)
    print("🧪 RUNNING ALL REGRESSION DETECTION TESTS")
    print("=" * 80)

    tests = [
        ("WER Detection (Synthetic)", test_wer_detection_with_synthetic_data),
        ("Speed Detection (Synthetic)", test_speed_detection_with_synthetic_data),
        ("Tokens Detection (Synthetic)", test_tokens_detection_with_synthetic_data),
        ("Release Regression Detection", test_release_regression_detection),
        ("Slack Message Generation", test_slack_message_generation),
        ("Edge Cases", test_edge_cases),
        ("Real Data Sample", test_with_real_data_sample),
    ]

    passed = 0
    failed = 0

    for test_name, test_func in tests:
        try:
            if test_func():
                passed += 1
            else:
                failed += 1
                print(f"\n❌ {test_name} FAILED")
        except AssertionError as e:
            failed += 1
            print(f"\n❌ {test_name} FAILED: {e}")
        except Exception as e:
            failed += 1
            print(f"\n❌ {test_name} ERROR: {e}")
            import traceback
            traceback.print_exc()

    # Print summary
    print("\n" + "=" * 80)
    print("TEST SUMMARY")
    print("=" * 80)
    print(f"✅ Passed: {passed}/{len(tests)}")
    print(f"❌ Failed: {failed}/{len(tests)}")

    if failed == 0:
        print("\n🎉 ALL TESTS PASSED! The implementation is working correctly.")
        print("\nNext steps:")
        print("1. Run manual verification helper to spot-check real data")
        print("2. Test in a non-production environment first")
        print("3. Monitor the first few runs carefully")
    else:
        print(f"\n⚠️ {failed} test(s) failed. Please review and fix issues.")
        return False

    return True
if __name__ == "__main__":
    success = run_all_tests()

    # Optionally run manual verification helper
    print("\n" + "=" * 80)
    response = input("Run manual verification helper? (y/n): ")
    if response.lower() == 'y':
        manual_verification_helper()

    sys.exit(0 if success else 1)
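
# Usage sketch (the filename below is an assumption; save the script next to
# wer_regression_check.py, ideally at the repository root so the optional
# dashboard_data/performance_data.json path resolves):
#
#     python test_wer_regression_check.py
#
# The script prompts before running the manual verification helper, so run it
# from an interactive terminal; it exits with status 0 if all tests pass and
# 1 otherwise.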