diff --git a/tests/unit/test_umma_qk.cu b/tests/unit/test_umma_qk.cu index cdd0175e..7884a279 100644 --- a/tests/unit/test_umma_qk.cu +++ b/tests/unit/test_umma_qk.cu @@ -70,7 +70,19 @@ test_umma_qk_hd16( } __syncthreads(); - // Call tcgen05.mma SS + // Verify SMEM layout: read back Q from sQ and compare with original + if (tid == 0) { + // Q row 0 is in canonical layout at: + // core(0,0): offset 0, local_r=0, local_c=0..7 → indices 0..7 + // core(0,1): offset 16*64=1024, local_r=0, local_c=0..7 → indices 1024..1031 + for (int d = 0; d < 16; d++) { + int core_k = d / 8; + int local_c = d % 8; + int idx = core_k * 16 * 64 + local_c; // tile_mn=0, local_r=0 + s_out[160 + d] = bf16_to_f32(sQ[idx]); + } + } + __syncthreads(); if (tid == 0) { umma_ss_f16(tmem_base, desc_q, desc_k, idesc, /*accumulate=*/false); }