D1.4: Use --opt-level 0 only (ptxas -j not supported, MLIR is the bottleneck)
This commit is contained in:
@@ -45,11 +45,12 @@ def test():
|
||||
|
||||
import time
|
||||
t0 = time.time()
|
||||
from cutlass.base_dsl.compiler import CompileOptions, PtxasOptions, OptLevel
|
||||
# PtxasOptions -j64: use 64 threads for ptxas register allocation (B200 has 256 cores)
|
||||
# OptLevel(0): skip MLIR optimizations for faster compilation (verify correctness first, then optimize)
|
||||
from cutlass.base_dsl.compiler import PtxasOptions, OptLevel
|
||||
# OptLevel(0): skip MLIR optimizations for faster compilation.
|
||||
# The bottleneck is the MLIR optimizer (not ptxas), so ptxas -j doesn't help.
|
||||
# Verify correctness first at O0, then re-compile at O3 for production.
|
||||
compiled = cute.compile(kernel, mQ, mK, mV, mC, stream, mLSE,
|
||||
options="--ptxas-options '-j64' --opt-level 0")
|
||||
options="--opt-level 0")
|
||||
t1 = time.time()
|
||||
print(f'Compilation took {t1-t0:.1f}s', flush=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user