# /// script
# dependencies = [
#     "torch",
# ]
# ///

"""Simple utilities for running the models."""
import torch

def to_dtype(dtype_str: str):
    """Convert string to torch dtype."""
    if dtype_str == "float16":
        return torch.float16
    if dtype_str == "bfloat16":
        return torch.bfloat16
    return torch.float32

def tensor_stats(t: torch.Tensor) -> str:
    """Generate stats string for a tensor."""
    return (f"shape={tuple(t.shape)}, "
            f"dtype={t.dtype}, "
            f"device={t.device}, "
            f"mean={t.mean().item():.6f}, "
            f"std={t.std().item():.6f}")

def set_seed(seed: int):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
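
# A minimal usage sketch for the helpers above (the shape and seed value are
# illustrative, not taken from elsewhere in this file):
#
#     set_seed(0)
#     x = torch.randn(2, 3, dtype=to_dtype("bfloat16"))
#     print(tensor_stats(x))
#     # -> shape=(2, 3), dtype=torch.bfloat16, device=cpu, mean=..., std=...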

"""Reusable benchmarking utilities for performance testing."""
import time
import numpy as np
from contextlib import contextmanager
from typing import Callable, Dict, Tuple, Any, Optional
import torch
import json

def precise_timing(func: Callable[[], Any], warmup: int = 5, iters: int = 20, 
                   input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
    """High precision timing function with warmup and optional input generation per iteration."""
    # Warmup
    for i in range(warmup):
        if input_generator:
            inputs = input_generator(i)
            func(inputs)
        else:
            func()
    
    # CUDA launches kernels asynchronously; synchronize so the timer starts
    # only after all warmup work has actually finished on the device.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    
    start = time.perf_counter()
    result = None
    for i in range(iters):
        if input_generator:
            inputs = input_generator(i + warmup)  # Continue seed sequence after warmup
            result = func(inputs)
        else:
            result = func()
    
    # Wait for any in-flight kernels to finish before stopping the clock.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    
    end = time.perf_counter()
    avg_time = (end - start) / iters
    return result, avg_time
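
# Sketch of precise_timing with a per-iteration input generator (the matmul
# workload and shapes are illustrative assumptions, not taken from this file):
#
#     w = torch.randn(64, 64)
#     gen = lambda i: torch.randn(8, 64)  # fresh input each iteration
#     _, t = precise_timing(lambda x: x @ w, warmup=3, iters=10,
#                           input_generator=gen)
#     print(f"avg {t * 1e3:.3f} ms/iter")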

def memory_usage() -> Dict[str, float]:
    """Get current memory usage in GB."""
    if not torch.cuda.is_available():
        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
    
    return {
        "allocated": torch.cuda.memory_allocated() / 1024**3,
        "cached": torch.cuda.memory_reserved() / 1024**3,
        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3
    }
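
# On a CUDA-less machine this deliberately reports zeros, e.g.
# {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}, so callers can
# subtract before/after readings without branching on device type.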

@contextmanager
def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
                  tokens: Optional[int] = None, save_json: Optional[str] = None,
                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
    """Context manager for benchmarking with comprehensive metrics and optional input generation."""
    
    def run_benchmark(model_func, *args, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        mem_before = memory_usage()
        
        # Create input generator if input_shape is provided
        input_generator = None
        if input_shape is not None:
            def create_input(iteration: int):
                # Use deterministic but different seed for each iteration
                iteration_seed = input_seed_base + iteration * 123  # Spread out seeds
                torch.manual_seed(iteration_seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(iteration_seed)
                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
            input_generator = create_input
        
        if input_generator:
            result, avg_time = precise_timing(model_func, warmup, iters, input_generator)
        else:
            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)
        
        mem_after = memory_usage()
        
        # Calculate metrics
        metrics = {
            "avg_time_ms": avg_time * 1000,
            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
            "memory_allocated_gb": mem_after["allocated"],
            "memory_cached_gb": mem_after["cached"],
            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
            "device": str(device) if device else "cpu",
            "dtype": str(dtype) if dtype else "float32",
            "tokens": tokens,
            "warmup_iters": warmup,
            "timing_iters": iters
        }
        
        # Print results
        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
        if tokens:
            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")
        
        # Save to JSON if requested
        if save_json:
            with open(save_json, 'w') as f:
                json.dump(metrics, f, indent=2)
        
        return result
    
    yield run_benchmark
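
if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the original utilities):
    # benchmark a tiny linear layer with fresh inputs generated per iteration.
    dev = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(8, 8).to(dev)
    with bench_context(warmup=2, iters=5, device=dev, input_shape=(4, 8)) as bench:
        out = bench(model)
        print(tensor_stats(out))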