#!/usr/bin/env python3
"""
CUDA Test Script for Speech Transcription App

This script helps users verify their CUDA setup and compare performance
between CPU and GPU configurations.

Usage: python test_cuda.py
"""

import os
import sys
import time

import numpy as np
import torch
from dotenv import load_dotenv


def print_header(title):
    """Print a formatted header"""
    print("\n" + "=" * 60)
    print(f" {title}")
    print("=" * 60)


def print_section(title):
    """Print a formatted section header"""
    print(f"\nšŸ” {title}")
    print("-" * 40)


def test_pytorch_cuda():
    """Test PyTorch CUDA availability and report device properties"""
    print_section("PyTorch CUDA Test")

    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

    if not torch.cuda.is_available():
        print("āŒ CUDA not available")
        return False

    print(f"CUDA version: {torch.version.cuda}")
    print(f"cuDNN version: {torch.backends.cudnn.version()}")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")

    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"Device {i}: {props.name}")
        print(f"  Memory: {props.total_memory / 1e9:.1f} GB")
        print(f"  Compute capability: {props.major}.{props.minor}")

    return True


def test_transformers_device():
    """Test transformers library device selection"""
    print_section("Transformers Device Test")

    try:
        from transformers import pipeline

        # Test with CPU (device=-1). Timing covers pipeline construction plus
        # one inference, and the first run also includes the model download.
        print("Testing CPU pipeline...")
        start_time = time.time()
        pipe_cpu = pipeline(
            "text-classification",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=-1,
        )
        result_cpu = pipe_cpu("This is a test sentence")
        cpu_time = time.time() - start_time
        print(f"āœ… CPU pipeline loaded and ran in {cpu_time:.2f}s")
        print(f"Result: {result_cpu}")

        # Test with CUDA (device=0) if available
        if torch.cuda.is_available():
            print("\nTesting CUDA pipeline...")
            start_time = time.time()
            pipe_cuda = pipeline(
                "text-classification",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                device=0,
            )
            result_cuda = pipe_cuda("This is a test sentence")
            cuda_time = time.time() - start_time
            print(f"āœ… CUDA pipeline loaded and ran in {cuda_time:.2f}s")
            print(f"Result: {result_cuda}")

            # Rough indicator only: load time dominates here, and CUDA pays
            # one-time initialization costs on the first call.
            speedup = cpu_time / cuda_time if cuda_time > 0 else 0
            print(f"\nšŸš€ Speedup: {speedup:.2f}x faster with CUDA")

        return True
    except Exception as e:
        print(f"āŒ Error testing transformers: {e}")
        return False


def test_whisper_models():
    """Test Whisper model loading on CPU and CUDA"""
    print_section("Whisper Model Test")

    try:
        from faster_whisper import WhisperModel

        # Load the CPU model (int8 quantization keeps CPU inference fast)
        print("Testing Whisper on CPU...")
        start_time = time.time()
        model_cpu = WhisperModel("tiny.en", device="cpu", compute_type="int8")
        cpu_load_time = time.time() - start_time
        print(f"āœ… CPU model loaded in {cpu_load_time:.2f}s")

        # Load the CUDA model if available (float16 is the usual GPU choice)
        if torch.cuda.is_available():
            print("\nTesting Whisper on CUDA...")
            start_time = time.time()
            try:
                model_cuda = WhisperModel("tiny.en", device="cuda", compute_type="float16")
                cuda_load_time = time.time() - start_time
                print(f"āœ… CUDA model loaded in {cuda_load_time:.2f}s")

                speedup = cpu_load_time / cuda_load_time if cuda_load_time > 0 else 0
                print(f"šŸš€ Load speedup: {speedup:.2f}x faster with CUDA")
            except Exception as e:
                print(f"āŒ Error loading CUDA model: {e}")
                return False

        return True
    except ImportError:
        print("āŒ faster-whisper not installed")
        return False
    except Exception as e:
        print(f"āŒ Error testing Whisper: {e}")
        return False
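
# The test above only measures model *load* time. As a rough sketch of an
# end-to-end check, the helper below times an actual transcribe() call on
# synthetic audio. It is not wired into main(); the 16 kHz mono float32
# input format and the lazy segment generator are faster-whisper conventions.
def benchmark_whisper_transcription(device="cpu", compute_type="int8"):
    """Time one transcription of 5 seconds of synthetic audio (sketch)."""
    from faster_whisper import WhisperModel

    model = WhisperModel("tiny.en", device=device, compute_type=compute_type)
    audio = np.zeros(16000 * 5, dtype=np.float32)  # 5 s of silence at 16 kHz

    start_time = time.time()
    segments, _info = model.transcribe(audio)
    _ = list(segments)  # decoding is lazy; consume the generator to run it
    elapsed = time.time() - start_time
    print(f"Transcribed 5 s of audio on {device} in {elapsed:.2f}s")
    return elapsed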

def test_memory_usage():
    """Test GPU memory allocation and reporting"""
    print_section("GPU Memory Test")

    if not torch.cuda.is_available():
        print("āŒ CUDA not available for memory test")
        return False

    # Get baseline memory usage
    torch.cuda.empty_cache()
    initial_memory = torch.cuda.memory_allocated()
    total_memory = torch.cuda.get_device_properties(0).total_memory

    print(f"Total GPU memory: {total_memory / 1e9:.1f} GB")
    print(f"Initial memory usage: {initial_memory / 1e6:.1f} MB")

    # Allocate a test tensor (1000x1000 float32, about 4 MB)
    try:
        test_tensor = torch.randn(1000, 1000, device="cuda")
        allocated_memory = torch.cuda.memory_allocated()
        print(f"Memory after tensor allocation: {allocated_memory / 1e6:.1f} MB")
        print(f"Available memory: {(total_memory - allocated_memory) / 1e9:.1f} GB")

        # Clean up
        del test_tensor
        torch.cuda.empty_cache()
        print("āœ… Memory test completed")
        return True
    except Exception as e:
        print(f"āŒ Memory test failed: {e}")
        return False


def test_environment_config():
    """Test environment configuration"""
    print_section("Environment Configuration Test")

    # Load the .env file if it exists
    script_dir = os.path.dirname(os.path.abspath(__file__))
    env_file = os.path.join(script_dir, ".env")
    if os.path.exists(env_file):
        load_dotenv(env_file)
        print(f"āœ… Found .env file: {env_file}")
    else:
        print(f"ā„¹ļø No .env file found at: {env_file}")
        print("   Create one from .env.example to configure CUDA usage")

    # Check the USE_CUDA setting
    use_cuda = os.getenv("USE_CUDA", "false").lower() == "true"
    print(f"USE_CUDA environment variable: {os.getenv('USE_CUDA', 'false')}")
    print(f"Parsed USE_CUDA value: {use_cuda}")

    # Test the config import
    try:
        sys.path.append(script_dir)
        from config import config

        print("āœ… Config module imported successfully")

        device_info = config.get_device_info()
        print(f"Selected device: {device_info['device']}")
        print(f"Compute type: {device_info['compute_type']}")
        return True
    except Exception as e:
        print(f"āŒ Error importing config: {e}")
        return False
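
# Reference sketch (assumption): a minimal config object compatible with the
# check above. The app's real config.py may differ; this stand-in only mirrors
# what this script relies on: USE_CUDA parsing and a get_device_info() dict
# with "device" and "compute_type" keys.
class _ExampleConfig:
    """Hypothetical stand-in for the app's config object; not used by the tests."""

    def get_device_info(self):
        use_cuda = os.getenv("USE_CUDA", "false").lower() == "true"
        if use_cuda and torch.cuda.is_available():
            return {"device": "cuda", "compute_type": "float16"}
        return {"device": "cpu", "compute_type": "int8"}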

def run_performance_benchmark():
    """Run a simple matrix-multiplication benchmark"""
    print_section("Performance Benchmark")

    if not torch.cuda.is_available():
        print("āŒ CUDA not available for benchmark")
        return

    # Matrix multiplication benchmark
    size = 2000
    iterations = 5

    print(f"Running {iterations} matrix multiplications ({size}x{size})...")

    # CPU benchmark (perf_counter gives higher-resolution timing than time.time)
    print("\nCPU benchmark:")
    cpu_times = []
    for i in range(iterations):
        a = torch.randn(size, size)
        b = torch.randn(size, size)
        start_time = time.perf_counter()
        _ = torch.mm(a, b)
        cpu_time = time.perf_counter() - start_time
        cpu_times.append(cpu_time)
        print(f"  Iteration {i + 1}: {cpu_time:.3f}s")

    avg_cpu_time = sum(cpu_times) / len(cpu_times)
    print(f"Average CPU time: {avg_cpu_time:.3f}s")

    # CUDA benchmark
    print("\nCUDA benchmark:")

    # Warm-up: the first CUDA op pays one-time initialization costs,
    # so exclude it from the timed iterations
    _ = torch.mm(torch.randn(size, size, device="cuda"),
                 torch.randn(size, size, device="cuda"))
    torch.cuda.synchronize()

    cuda_times = []
    for i in range(iterations):
        a = torch.randn(size, size, device="cuda")
        b = torch.randn(size, size, device="cuda")
        torch.cuda.synchronize()  # Wait for pending GPU work before timing
        start_time = time.perf_counter()
        _ = torch.mm(a, b)
        torch.cuda.synchronize()  # Kernels run asynchronously; wait for completion
        cuda_time = time.perf_counter() - start_time
        cuda_times.append(cuda_time)
        print(f"  Iteration {i + 1}: {cuda_time:.3f}s")

    avg_cuda_time = sum(cuda_times) / len(cuda_times)
    print(f"Average CUDA time: {avg_cuda_time:.3f}s")

    speedup = avg_cpu_time / avg_cuda_time if avg_cuda_time > 0 else 0
    print(f"\nšŸš€ Overall speedup: {speedup:.2f}x faster with CUDA")


def main():
    """Main test function"""
    print_header("CUDA Configuration Test for Speech Transcription App")
    print("This script will test your CUDA setup and help you configure")
    print("the speech transcription app for optimal performance.")

    # Run tests
    tests_passed = 0
    total_tests = 5

    if test_pytorch_cuda():
        tests_passed += 1
    if test_transformers_device():
        tests_passed += 1
    if test_whisper_models():
        tests_passed += 1
    if test_memory_usage():
        tests_passed += 1
    if test_environment_config():
        tests_passed += 1

    # Performance benchmark (optional)
    if torch.cuda.is_available():
        try:
            run_performance_benchmark()
        except Exception as e:
            print(f"āŒ Benchmark failed: {e}")

    # Summary
    print_header("Test Summary")
    print(f"Tests passed: {tests_passed}/{total_tests}")

    if tests_passed == total_tests and torch.cuda.is_available():
        print("šŸŽ‰ All tests passed! Your CUDA setup is working correctly.")
        print("\nTo enable CUDA acceleration:")
        print("1. Create a .env file (copy from .env.example)")
        print("2. Set USE_CUDA=true in the .env file")
        print("3. Run the speech transcription app")
    elif torch.cuda.is_available():
        print("āš ļø Some tests failed. Check the error messages above.")
        print("You may still be able to use CUDA, but with potential issues.")
    else:
        print("ā„¹ļø CUDA not available. The app will run on CPU.")
        print("This is perfectly fine for most use cases!")
        print("\nFor CPU usage (always works):")
        print("1. Create a .env file (copy from .env.example)")
        print("2. Set USE_CUDA=false in the .env file")
        print("3. Run the speech transcription app")


if __name__ == "__main__":
    main()