#!/usr/bin/env python3
"""
Test script for WER, speed, and tokens/sec regression detection.
Tests all regression detection functions with synthetic and real data.
"""
import json
import sys
from wer_regression_check import (
detect_device_regressions,
detect_os_regressions,
detect_release_regressions,
detect_speed_device_regressions,
detect_speed_os_regressions,
detect_speed_release_regressions,
detect_tokens_device_regressions,
detect_tokens_os_regressions,
detect_tokens_release_regressions,
generate_slack_message,
load_performance_data
)
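# Assumed behavior of the functions under test (implemented in wer_regression_check.py,
# not shown here): each detect_* function compares current data against the best
# historical value for the same model (grouped by device, OS, or release) and reports a
# regression when the deviation exceeds `threshold` percent, roughly:
#   WER:               (current - best) / best * 100 > threshold   (higher is worse)
#   speed, tokens/sec: (best - current) / best * 100 > threshold   (lower is worse)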
def test_wer_detection_with_synthetic_data():
"""Test WER detection with known synthetic data"""
print("\n" + "="*80)
print("TEST 1: WER Detection with Synthetic Data")
print("="*80)
# Create synthetic data where we know there should be regressions
# Historical data (best performances)
historical_data = [
# Model A: iPhone has best WER of 10%
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0},
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0},
{"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.2, "speed": 9.8, "tokens_per_second": 49.0},
# Model B: iOS 17 has best WER of 10%
{"model": "model-b", "device": "iPhone 15", "os": "iOS 17", "average_wer": 10.0, "speed": 20.0, "tokens_per_second": 100.0},
{"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 19.0, "tokens_per_second": 95.0},
# Model C: No regression scenario
{"model": "model-c", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0},
{"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0},
]
# Current data (latest release with regressions)
current_data = [
# Model A: iPad Pro has regressed to 15% WER (50% worse than best 10%)
{"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 15.0, "speed": 8.0, "tokens_per_second": 40.0},
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.3, "speed": 9.7, "tokens_per_second": 49.5},
# Model B: iOS 18 has regressed to 13% WER (30% worse than best 10%)
{"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 13.0, "speed": 15.0, "tokens_per_second": 75.0},
# Model C: Still within 20% (11% vs best 10%)
{"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 9.0, "tokens_per_second": 45.0},
]
# Test device regressions
device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0)
print(f"\nβœ“ Device WER Regressions Found: {len(device_regressions)}")
# Debug: print all found regressions
for r in device_regressions:
print(f" - {r['model']}: {r['device']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)")
    # Model A should trigger (iPad Pro's 15% WER is well beyond the 20% threshold vs the best of ~10%)
# Model C should NOT trigger (iPad Pro is only 10% worse)
assert len(device_regressions) >= 1, f"Expected at least 1 device regression, got {len(device_regressions)}"
# Verify model-a is in the regressions
model_a_regressions = [r for r in device_regressions if r["model"] == "model-a"]
assert len(model_a_regressions) > 0, "Expected model-a to have device regression"
print(f"\nβœ“ Model-a correctly flagged for device regression")
# Test OS regressions
os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0)
print(f"\nβœ“ OS WER Regressions Found: {len(os_regressions)}")
# Debug: print all found OS regressions
for r in os_regressions:
print(f" - {r['model']}: {r['os']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)")
assert len(os_regressions) >= 1, f"Expected at least 1 OS regression, got {len(os_regressions)}"
# Verify model-b is in the regressions
model_b_regressions = [r for r in os_regressions if r["model"] == "model-b"]
assert len(model_b_regressions) > 0, "Expected model-b to have OS regression"
print(f"\nβœ“ Model-b correctly flagged for OS regression")
print("\nβœ… TEST 1 PASSED: WER detection works correctly with synthetic data")
return True
def test_speed_detection_with_synthetic_data():
"""Test speed detection with known synthetic data"""
print("\n" + "="*80)
print("TEST 2: Speed Detection with Synthetic Data")
print("="*80)
# Historical data (best performances)
historical_data = [
# Model A: iPhone has best speed of 100
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 200.0},
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 190.0},
{"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 98.0, "tokens_per_second": 195.0},
]
# Current data (with speed regression)
current_data = [
# Model A: iPad Pro has regressed to 60 speed (40% slower than best 100)
{"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 60.0, "tokens_per_second": 120.0},
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.2, "speed": 97.0, "tokens_per_second": 195.0},
]
# Test device speed regressions
speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0)
print(f"\nβœ“ Device Speed Regressions Found: {len(speed_device_regressions)}")
assert len(speed_device_regressions) == 1, f"Expected 1 speed device regression, got {len(speed_device_regressions)}"
print(f" - {speed_device_regressions[0]['model']}: {speed_device_regressions[0]['device']} has {speed_device_regressions[0]['current_value']}x speed vs best {speed_device_regressions[0]['best_value']}x")
print("\nβœ… TEST 2 PASSED: Speed detection works correctly with synthetic data")
return True
def test_tokens_detection_with_synthetic_data():
"""Test tokens per second detection with known synthetic data"""
print("\n" + "="*80)
print("TEST 3: Tokens/Second Detection with Synthetic Data")
print("="*80)
# Historical data (best performances)
historical_data = [
# Model A: iPhone has best tokens/sec of 500
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
{"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 98.0, "tokens_per_second": 490.0},
]
# Current data (with tokens/sec regression)
current_data = [
# Model A: iPad Pro has regressed to 300 tokens/sec (40% slower than best 500)
{"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 80.0, "tokens_per_second": 300.0},
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.1, "speed": 99.0, "tokens_per_second": 495.0},
]
# Test device tokens regressions
tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0)
print(f"\nβœ“ Device Tokens/Sec Regressions Found: {len(tokens_device_regressions)}")
assert len(tokens_device_regressions) == 1, f"Expected 1 tokens device regression, got {len(tokens_device_regressions)}"
print(f" - {tokens_device_regressions[0]['model']}: {tokens_device_regressions[0]['device']} has {tokens_device_regressions[0]['current_value']} tokens/sec vs best {tokens_device_regressions[0]['best_value']}")
print("\nβœ… TEST 3 PASSED: Tokens/sec detection works correctly with synthetic data")
return True
def test_release_regression_detection():
"""Test release-to-release regression detection"""
print("\n" + "="*80)
print("TEST 4: Release-to-Release Regression Detection")
print("="*80)
# Previous release data (best performance)
previous_data = [
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 490.0},
]
    # Current release data (degraded performance: WER 50% worse, speed and tokens/sec 40% lower)
current_data = [
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 15.0, "speed": 60.0, "tokens_per_second": 300.0},
]
# Test WER release regression
wer_release_regressions = detect_release_regressions(current_data, previous_data, threshold=20.0)
print(f"\nβœ“ WER Release Regressions Found: {len(wer_release_regressions)}")
assert len(wer_release_regressions) == 1, f"Expected 1 WER release regression, got {len(wer_release_regressions)}"
print(f" - {wer_release_regressions[0]['model']}: WER increased from {wer_release_regressions[0]['best_historical_value']}% to {wer_release_regressions[0]['current_value']}%")
# Test speed release regression
speed_release_regressions = detect_speed_release_regressions(current_data, previous_data, threshold=20.0)
print(f"\nβœ“ Speed Release Regressions Found: {len(speed_release_regressions)}")
assert len(speed_release_regressions) == 1, f"Expected 1 speed release regression, got {len(speed_release_regressions)}"
print(f" - {speed_release_regressions[0]['model']}: Speed decreased from {speed_release_regressions[0]['best_historical_value']}x to {speed_release_regressions[0]['current_value']}x")
# Test tokens release regression
tokens_release_regressions = detect_tokens_release_regressions(current_data, previous_data, threshold=20.0)
print(f"\nβœ“ Tokens/Sec Release Regressions Found: {len(tokens_release_regressions)}")
assert len(tokens_release_regressions) == 1, f"Expected 1 tokens release regression, got {len(tokens_release_regressions)}"
print(f" - {tokens_release_regressions[0]['model']}: Tokens/sec decreased from {tokens_release_regressions[0]['best_historical_value']} to {tokens_release_regressions[0]['current_value']}")
print("\nβœ… TEST 4 PASSED: Release-to-release regression detection works correctly")
return True
def test_slack_message_generation():
"""Test Slack message generation"""
print("\n" + "="*80)
print("TEST 5: Slack Message Generation")
print("="*80)
# Create sample regressions
sample_regressions = [
{
"type": "device_wer_discrepancy",
"metric": "WER",
"model": "test-model",
"device": "iPad Pro",
"current_value": 35.0,
"best_value": 25.0,
"best_device": "iPhone 15",
"best_os": "iOS 18",
"percentage_diff": 40.0
},
{
"type": "device_speed_discrepancy",
"metric": "Speed",
"model": "test-model",
"device": "iPad Pro",
"current_value": 60.0,
"best_value": 100.0,
"best_device": "iPhone 15",
"best_os": "iOS 18",
"percentage_diff": 40.0
}
]
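    # generate_slack_message is assumed to return a Slack Block Kit payload: a dict whose
    # top-level "blocks" list renders the regression summary for posting to a channel.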
# Generate Slack message
slack_payload = generate_slack_message(sample_regressions)
assert slack_payload is not None, "Expected Slack payload to be generated"
assert "blocks" in slack_payload, "Expected 'blocks' in Slack payload"
assert len(slack_payload["blocks"]) > 0, "Expected at least one block in Slack payload"
print(f"\nβœ“ Slack Message Generated Successfully")
print(f" - Total blocks: {len(slack_payload['blocks'])}")
print(f"\nπŸ“§ Full Slack Message Payload:")
print("=" * 80)
print(json.dumps(slack_payload, indent=2))
print("=" * 80)
print("\nβœ… TEST 5 PASSED: Slack message generation works correctly")
return True
def test_edge_cases():
"""Test edge cases"""
print("\n" + "="*80)
print("TEST 6: Edge Cases")
print("="*80)
# Test with single data point (should not trigger any regressions - no historical comparison)
single_current = [
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
]
empty_historical = []
device_regressions = detect_device_regressions(single_current, empty_historical, threshold=20.0)
assert len(device_regressions) == 0, f"Expected 0 regressions with no historical data, got {len(device_regressions)}"
print("βœ“ Single data point with no historical data handled correctly (no regressions)")
# Test with empty current data
empty_regressions = detect_device_regressions([], single_current, threshold=20.0)
assert len(empty_regressions) == 0, "Expected 0 regressions with empty current data"
print("βœ“ Empty current data handled correctly")
# Test with missing fields (tokens_per_second missing)
partial_historical = [
{"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0},
{"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0},
]
partial_current = [
{"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 30.0, "speed": 80.0},
]
# Should still work for WER and speed
device_regressions = detect_device_regressions(partial_current, partial_historical, threshold=20.0)
print(f"βœ“ Partial data (missing tokens) handled correctly: {len(device_regressions)} WER regressions found")
# Should not crash for tokens
tokens_regressions = detect_tokens_device_regressions(partial_current, partial_historical, threshold=20.0)
assert len(tokens_regressions) == 0, "Expected 0 tokens regressions when field is missing"
print("βœ“ Missing tokens_per_second field handled gracefully")
print("\nβœ… TEST 6 PASSED: Edge cases handled correctly")
return True
def test_with_real_data_sample():
"""Test with a small sample of real data to verify calculations"""
print("\n" + "="*80)
print("TEST 7: Real Data Sample Verification")
print("="*80)
try:
# Load a sample of real data
real_data = load_performance_data("dashboard_data/performance_data.json")
if len(real_data) == 0:
print("⚠️ No real data found, skipping this test")
return True
print(f"βœ“ Loaded {len(real_data)} real data points")
# Get unique models
        models = {entry["model"] for entry in real_data}
print(f"βœ“ Found {len(models)} unique models")
        # Split the real data into historical (first 90%) and current (last 10%) slices for testing
split_point = int(len(real_data) * 0.9)
historical_data = real_data[:split_point] if split_point > 0 else real_data
current_data = real_data[split_point:] if split_point > 0 else real_data[:10]
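        # Note: this 90/10 split is only a smoke test of the detection pipeline on real
        # data, not an actual release boundary, so regressions reported here are informational.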
# Run detection on real data
device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0)
os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0)
speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0)
tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0)
print(f"\nβœ“ Real Data Analysis:")
print(f" - WER device regressions: {len(device_regressions)}")
print(f" - WER OS regressions: {len(os_regressions)}")
print(f" - Speed device regressions: {len(speed_device_regressions)}")
print(f" - Tokens device regressions: {len(tokens_device_regressions)}")
# Show a few examples if any found
if device_regressions:
print(f"\n Example WER regression:")
r = device_regressions[0]
print(f" Model: {r['model']}")
print(f" Device: {r['device']} on {r['os']}")
print(f" Current: {r['current_value']}% WER")
print(f" Historical best: {r['best_value']}% WER")
print(f" Deviation: +{r['percentage_diff']}%")
if speed_device_regressions:
print(f"\n Example Speed regression:")
r = speed_device_regressions[0]
print(f" Model: {r['model']}")
print(f" Device: {r['device']} on {r['os']}")
print(f" Current: {r['current_value']}x speed")
print(f" Historical best: {r['best_value']}x speed")
print(f" Slower by: {r['percentage_diff']}%")
print("\nβœ… TEST 7 PASSED: Real data processed successfully")
return True
except FileNotFoundError:
print("⚠️ dashboard_data/performance_data.json not found, skipping real data test")
return True
except Exception as e:
print(f"❌ Error processing real data: {e}")
return False
def manual_verification_helper():
"""Print data for manual verification"""
print("\n" + "="*80)
print("MANUAL VERIFICATION HELPER")
print("="*80)
try:
real_data = load_performance_data("dashboard_data/performance_data.json")
# Pick a model to analyze in detail
models = {}
for entry in real_data:
model = entry["model"]
if model not in models:
models[model] = []
models[model].append(entry)
# Find a model with multiple entries
for model_name, entries in list(models.items())[:3]: # Check first 3 models
if len(entries) >= 3:
print(f"\nπŸ“Š Model: {model_name}")
print(f" Total data points: {len(entries)}")
# Show WER stats
wer_values = [e["average_wer"] for e in entries]
print(f"\n WER Analysis:")
print(f" - Best (min): {min(wer_values):.2f}%")
print(f" - Worst (max): {max(wer_values):.2f}%")
print(f" - Difference: {((max(wer_values) - min(wer_values)) / min(wer_values) * 100):.1f}%")
# Show by device
devices = {}
for entry in entries:
device = entry["device"]
if device not in devices:
devices[device] = []
devices[device].append(entry["average_wer"])
print(f"\n WER by Device:")
for device, wers in devices.items():
avg_wer = sum(wers) / len(wers)
num_samples = len(wers)
print(f" - {device}: {avg_wer:.2f}% avg ({num_samples} test runs)")
# Show speed stats if available
if "speed" in entries[0]:
speed_values = [e["speed"] for e in entries]
print(f"\n Speed Analysis:")
print(f" - Best (max): {max(speed_values):.2f}x")
print(f" - Worst (min): {min(speed_values):.2f}x")
print(f" - Difference: {((max(speed_values) - min(speed_values)) / max(speed_values) * 100):.1f}%")
break
print("\n" + "="*80)
print("Use the above data to manually verify regression detection logic")
print("="*80)
except Exception as e:
print(f"Could not load data for manual verification: {e}")
def run_all_tests():
"""Run all tests"""
print("\n" + "="*80)
print("πŸ§ͺ RUNNING ALL REGRESSION DETECTION TESTS")
print("="*80)
tests = [
("WER Detection (Synthetic)", test_wer_detection_with_synthetic_data),
("Speed Detection (Synthetic)", test_speed_detection_with_synthetic_data),
("Tokens Detection (Synthetic)", test_tokens_detection_with_synthetic_data),
("Release Regression Detection", test_release_regression_detection),
("Slack Message Generation", test_slack_message_generation),
("Edge Cases", test_edge_cases),
("Real Data Sample", test_with_real_data_sample),
]
passed = 0
failed = 0
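    # Each test returns True on success; assertion failures and unexpected exceptions are
    # both counted as failures, with a traceback printed for unexpected errors.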
for test_name, test_func in tests:
try:
if test_func():
passed += 1
else:
failed += 1
print(f"\n❌ {test_name} FAILED")
except AssertionError as e:
failed += 1
print(f"\n❌ {test_name} FAILED: {e}")
except Exception as e:
failed += 1
print(f"\n❌ {test_name} ERROR: {e}")
import traceback
traceback.print_exc()
# Print summary
print("\n" + "="*80)
print("TEST SUMMARY")
print("="*80)
print(f"βœ… Passed: {passed}/{len(tests)}")
print(f"❌ Failed: {failed}/{len(tests)}")
if failed == 0:
print("\nπŸŽ‰ ALL TESTS PASSED! The implementation is working correctly.")
print("\nNext steps:")
print("1. Run manual verification helper to spot-check real data")
print("2. Test in a non-production environment first")
print("3. Monitor the first few runs carefully")
else:
print(f"\n⚠️ {failed} test(s) failed. Please review and fix issues.")
return False
return True
if __name__ == "__main__":
success = run_all_tests()
    # Optionally run the manual verification helper (skipped when stdin is not interactive, e.g. in CI)
    print("\n" + "="*80)
    if sys.stdin.isatty():
        response = input("Run manual verification helper? (y/n): ")
        if response.lower() == 'y':
            manual_verification_helper()
sys.exit(0 if success else 1)