diff --git a/tests/unit/test_d1_hd512_only.py b/tests/unit/test_d1_hd512_only.py
index 6a8e314f..8b3a4767 100644
--- a/tests/unit/test_d1_hd512_only.py
+++ b/tests/unit/test_d1_hd512_only.py
@@ -45,11 +45,12 @@ def test():
 
     import time
     t0 = time.time()
-    from cutlass.base_dsl.compiler import CompileOptions, PtxasOptions, OptLevel
-    # PtxasOptions -j64: use 64 threads for ptxas register allocation (B200 has 256 cores)
-    # OptLevel(0): skip MLIR optimizations for faster compilation (verify correctness first, then optimize)
+    from cutlass.base_dsl.compiler import PtxasOptions, OptLevel
+    # --opt-level 0: skip MLIR optimizations for faster compilation.
+    # The bottleneck is the MLIR optimizer (not ptxas), so the ptxas -j flag doesn't help.
+    # Verify correctness first at O0, then recompile at O3 for production.
     compiled = cute.compile(kernel, mQ, mK, mV, mC, stream, mLSE,
-                           options="--ptxas-options '-j64' --opt-level 0")
+                           options="--opt-level 0")
     t1 = time.time()
     print(f'Compilation took {t1-t0:.1f}s', flush=True)