diff --git a/dsv4/kernels/attention/fmha_mixed_fp8_decode.cuh b/dsv4/kernels/attention/fmha_mixed_fp8_decode.cuh index 65cd4b04..eb56251d 100644 --- a/dsv4/kernels/attention/fmha_mixed_fp8_decode.cuh +++ b/dsv4/kernels/attention/fmha_mixed_fp8_decode.cuh @@ -307,7 +307,7 @@ fmha_mixed_fp8_decode_kernel(FmhaMixedFp8DecodeParams p) { } // B is (K=16 rows, N=16 cols). Reuse BF16 canonical with rows=16 // by embedding into the first 16 rows of a 128-row tile; MMA_N=16. - sV[canon_idx_bf16_16x16(kk, dd)] = vbits; + sV[canon_idx_bf16_16x16(dd, kk)] = vbits; } __syncthreads();