From 939b9400f144e2811b24e7cdc4b1d3981b7a587c Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 24 May 2026 15:43:17 +0000 Subject: [PATCH] D1.4: Use --opt-level 0 only (ptxas -j not supported, MLIR is the bottleneck) --- tests/unit/test_d1_hd512_only.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_d1_hd512_only.py b/tests/unit/test_d1_hd512_only.py index 6a8e314f..8b3a4767 100644 --- a/tests/unit/test_d1_hd512_only.py +++ b/tests/unit/test_d1_hd512_only.py @@ -45,11 +45,12 @@ def test(): import time t0 = time.time() - from cutlass.base_dsl.compiler import CompileOptions, PtxasOptions, OptLevel - # PtxasOptions -j64: use 64 threads for ptxas register allocation (B200 has 256 cores) - # OptLevel(0): skip MLIR optimizations for faster compilation (verify correctness first, then optimize) + from cutlass.base_dsl.compiler import PtxasOptions, OptLevel + # OptLevel(0): skip MLIR optimizations for faster compilation. + # The bottleneck is the MLIR optimizer (not ptxas), so ptxas -j doesn't help. + # Verify correctness first at O0, then re-compile at O3 for production. compiled = cute.compile(kernel, mQ, mK, mV, mC, stream, mLSE, - options="--ptxas-options '-j64' --opt-level 0") + options="--opt-level 0") t1 = time.time() print(f'Compilation took {t1-t0:.1f}s', flush=True)