test: debug single K-tile from full (128,64) SMEM

This commit is contained in:
2026-05-28 12:55:52 +00:00
parent a7e8b483cd
commit ba2e390e1e

View File

@@ -97,7 +97,7 @@ test_umma_hd64(const bf16_t* __restrict__ q, const bf16_t* __restrict__ k,
uint32_t sK_smem = __cvta_generic_to_shared(sK);
uint32_t idesc = make_idesc(BLOCK_MN, BLOCK_MN);
for (int kt = 0; kt < NKT; kt++) {
for (int kt = 0; kt < 1; kt++) { // DEBUG: single K-tile from full SMEM
// K-tile offset in canonical layout:
// Each 16-BF16 K-tile spans 2 core columns.
// Core column 2*kt starts at offset 2*kt * (128/8 * 128) bytes = 2*kt * 2048 bytes = kt * 4096 bytes.
@@ -144,7 +144,7 @@ test_umma_hd64(const bf16_t* __restrict__ q, const bf16_t* __restrict__ k,
if (tid == 0) {
for (int j = 0; j < SK; j++) {
float dot = 0.0f;
for (int d = 0; d < HD; d++)
for (int d = 0; d < 16; d++) // DEBUG: single K-tile
dot += bf16_to_f32(q[d]) * bf16_to_f32(k[j * HD + d]);
s_scalar[j] = dot * scale;
}