File size: 873 Bytes
f619960 9ff03de | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | import torch
import torch._inductor.config as inductor_config
import torch._dynamo as dynamo
# Enable TensorFloat32 tensor cores for float32 matmul (Ampere+ GPUs)
# Provides significant speedup with minimal precision loss
torch.set_float32_matmul_precision('high')
# Enable TF32 for matrix multiplications and cuDNN operations
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Enable cuDNN autotuner - finds fastest algorithms for your hardware
# Best when input sizes are consistent; may slow down first iterations
torch.backends.cudnn.benchmark = True
# Deterministic operations off for speed (set True if reproducibility needed)
torch.backends.cudnn.deterministic = False
inductor_config.max_autotune_gemm_backends = "ATEN,CUTLASS,FBGEMM"
dynamo.config.capture_scalar_outputs = True
torch._dynamo.config.recompile_limit = 16
|