diff --git a/dsv4/kernels/attention/fmha_qk_verify.cuh b/dsv4/kernels/attention/fmha_qk_verify.cuh index 396b6ca8..3d4628d6 100644 --- a/dsv4/kernels/attention/fmha_qk_verify.cuh +++ b/dsv4/kernels/attention/fmha_qk_verify.cuh @@ -48,6 +48,9 @@ fmha_qk_verify( } __syncthreads(); + // SKIP TMEM — just test SMEM loads and scalar QK + // No TMEM alloc, no MMA + /* // TMEM alloc for S: 128 columns if (wid == 0) { uint32_t smem_ptr = __cvta_generic_to_shared(sTmemBase); @@ -55,6 +58,8 @@ fmha_qk_verify( } __syncthreads(); uint32_t tmem_base = *sTmemBase; + */ + uint32_t tmem_base = 0; // dummy // Zero TMEM S if (wid == 0) {