# /// script
# dependencies = [
#     "torch",
#     "numpy",
# ]
# ///

"""Simple utilities for running the models."""

import torch


def to_dtype(dtype_str: str):
    """Convert string to torch dtype."""
    if dtype_str == "float16":
        return torch.float16
    if dtype_str == "bfloat16":
        return torch.bfloat16
    return torch.float32


def tensor_stats(t: torch.Tensor) -> str:
    """Generate stats string for a tensor."""
    return (f"shape={tuple(t.shape)}, "
            f"dtype={t.dtype}, "
            f"device={t.device}, "
            f"mean={t.mean().item():.6f}, "
            f"std={t.std().item():.6f}")


def set_seed(seed: int):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
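

# A minimal usage sketch for the helpers above (illustrative only: the dtype
# string, seed, and tensor shape are arbitrary, not part of the original code).
def _demo_run_utils():
    set_seed(0)
    x = torch.randn(2, 8, dtype=to_dtype("float32"))
    print(tensor_stats(x))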
| """Reusable benchmarking utilities for performance testing.""" | |
| import time | |
| import numpy as np | |
| from contextlib import contextmanager | |
| from typing import Callable, Dict, Tuple, Any, Optional | |
| import torch | |
| import json | |


def precise_timing(func: Callable[..., Any], warmup: int = 5, iters: int = 20,
                   input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
    """High-precision timing with warmup and optional per-iteration input generation."""
    # Warmup passes (untimed)
    for i in range(warmup):
        if input_generator:
            inputs = input_generator(i)
            func(inputs)
        else:
            func()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()
    result = None
    for i in range(iters):
        if input_generator:
            inputs = input_generator(i + warmup)  # Continue the seed sequence after warmup
            result = func(inputs)
        else:
            result = func()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    end = time.perf_counter()
    avg_time = (end - start) / iters
    return result, avg_time
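

# Sketch of how precise_timing can be driven with fresh inputs each iteration
# (the matmul workload and sizes here are arbitrary placeholders).
def _demo_precise_timing():
    w = torch.randn(256, 256)

    def make_input(i: int) -> torch.Tensor:
        torch.manual_seed(i)  # The generator receives the iteration index
        return torch.randn(64, 256)

    _, avg = precise_timing(lambda x: x @ w, warmup=3, iters=10,
                            input_generator=make_input)
    print(f"matmul avg: {avg * 1e3:.3f} ms")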


def memory_usage() -> Dict[str, float]:
    """Get current CUDA memory usage in GB (zeros when CUDA is unavailable)."""
    if not torch.cuda.is_available():
        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
    return {
        "allocated": torch.cuda.memory_allocated() / 1024**3,
        "cached": torch.cuda.memory_reserved() / 1024**3,
        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3,
    }
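

# Sketch: snapshot memory around an allocation. On a CPU-only machine this
# prints zeros, matching the fallback in memory_usage().
def _demo_memory_usage():
    before = memory_usage()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    buf = torch.empty(1024, 1024, device=device)
    after = memory_usage()
    print(f"increase: {after['allocated'] - before['allocated']:.4f} GB")
    del buf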


@contextmanager
def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
                  tokens: Optional[int] = None, save_json: Optional[str] = None,
                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
    """Context manager for benchmarking with comprehensive metrics and optional input generation."""
    def run_benchmark(model_func, *args, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        mem_before = memory_usage()

        # Create an input generator if input_shape is provided
        input_generator = None
        if input_shape is not None:
            def create_input(iteration: int):
                # Use a deterministic but different seed for each iteration
                iteration_seed = input_seed_base + iteration * 123  # Spread out the seeds
                torch.manual_seed(iteration_seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(iteration_seed)
                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
            input_generator = create_input

        if input_generator:
            result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
        else:
            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)
        mem_after = memory_usage()

        # Calculate metrics
        metrics = {
            "avg_time_ms": avg_time * 1000,
            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
            "memory_allocated_gb": mem_after["allocated"],
            "memory_cached_gb": mem_after["cached"],
            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
            "device": str(device) if device else "cpu",
            "dtype": str(dtype) if dtype else "float32",
            "tokens": tokens,
            "warmup_iters": warmup,
            "timing_iters": iters,
        }

        # Print results
        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
        if tokens:
            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")

        # Save to JSON if requested
        if save_json:
            with open(save_json, "w") as f:
                json.dump(metrics, f, indent=2)
        return result

    yield run_benchmark
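

# Sketch of the intended usage, assuming the @contextmanager decorator above.
# The linear layer, shapes, and iteration counts are placeholders.
def _demo_bench_context():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(512, 512).to(device)
    with bench_context(warmup=2, iters=5, device=device, dtype=torch.float32,
                       input_shape=(8, 512)) as bench:
        bench(model)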