lhallee committed on
Commit
02f00c4
·
verified ·
1 Parent(s): e517463

Upload entrypoint_setup.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. entrypoint_setup.py +22 -22
entrypoint_setup.py CHANGED
@@ -1,22 +1,22 @@
1
- import torch
2
- import torch._inductor.config as inductor_config
3
- import torch._dynamo as dynamo
4
-
5
- # Enable TensorFloat32 tensor cores for float32 matmul (Ampere+ GPUs)
6
- # Provides significant speedup with minimal precision loss
7
- torch.set_float32_matmul_precision('high')
8
-
9
- # Enable TF32 for matrix multiplications and cuDNN operations
10
- torch.backends.cuda.matmul.allow_tf32 = True
11
- torch.backends.cudnn.allow_tf32 = True
12
-
13
- # Enable cuDNN autotuner - finds fastest algorithms for your hardware
14
- # Best when input sizes are consistent; may slow down first iterations
15
- torch.backends.cudnn.benchmark = True
16
-
17
- # Deterministic operations off for speed (set True if reproducibility needed)
18
- torch.backends.cudnn.deterministic = False
19
- inductor_config.max_autotune_gemm_backends = "ATEN,CUTLASS,FBGEMM"
20
-
21
- dynamo.config.capture_scalar_outputs = True
22
- torch._dynamo.config.recompile_limit = 16
 
1
+ import torch
2
+ import torch._inductor.config as inductor_config
3
+ import torch._dynamo as dynamo
4
+
5
+ # Enable TensorFloat32 tensor cores for float32 matmul (Ampere+ GPUs)
6
+ # Provides significant speedup with minimal precision loss
7
+ torch.set_float32_matmul_precision('high')
8
+
9
+ # Enable TF32 for matrix multiplications and cuDNN operations
10
+ torch.backends.cuda.matmul.allow_tf32 = True
11
+ torch.backends.cudnn.allow_tf32 = True
12
+
13
+ # Enable cuDNN autotuner - finds fastest algorithms for your hardware
14
+ # Best when input sizes are consistent; may slow down first iterations
15
+ torch.backends.cudnn.benchmark = True
16
+
17
+ # Deterministic operations off for speed (set True if reproducibility needed)
18
+ torch.backends.cudnn.deterministic = False
19
+ inductor_config.max_autotune_gemm_backends = "ATEN,CUTLASS,FBGEMM"
20
+
21
+ dynamo.config.capture_scalar_outputs = True
22
+ torch._dynamo.config.recompile_limit = 16