D1: Reduce kv_stage to 1 at hd>128 to avoid SMEM overflow

2026-05-24 03:55:44 +00:00
parent 39367265e5
commit 6597a1cd16
1 changed files with 1 additions and 1 deletions
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -30,7 +30,7 @@ class FmhaKernel:
        self.cluster_shape_mn = (1, 1); self.cta_group = tcgen05.CtaGroup.ONE
        self.epilogue_warp_id = (0,1,2,3); self.mma_warp_id = 4; self.tma_warp_id = 5
        self.threads_per_cta = 192; self.num_c_stage = 2
-        self.kv_stage = 2; self.q_stage = 1; self.num_c_stage = 2
+        self.kv_stage = 1 if head_dim > 128 else 2; self.q_stage = 1; self.num_c_stage = 2  # kv_stage drops to 1 when head_dim > 128 to reduce SMEM use
        self.scale_softmax = scale_softmax if scale_softmax is not None else 1.0 / math.sqrt(self.head_dim)
        self.scale_softmax_log2 = self.scale_softmax * math.log2(math.e)