test: debug single K-tile from full (128,64) SMEM
This commit is contained in:
@@ -97,7 +97,7 @@ test_umma_hd64(const bf16_t* __restrict__ q, const bf16_t* __restrict__ k,
|
||||
uint32_t sK_smem = __cvta_generic_to_shared(sK);
|
||||
uint32_t idesc = make_idesc(BLOCK_MN, BLOCK_MN);
|
||||
|
||||
for (int kt = 0; kt < NKT; kt++) {
|
||||
for (int kt = 0; kt < 1; kt++) { // DEBUG: single K-tile from full SMEM
|
||||
// K-tile offset in canonical layout:
|
||||
// Each 16-BF16 K-tile spans 2 core columns.
|
||||
// Core column 2*kt starts at byte offset 2*kt * (128/8 * 128) = 2*kt * 2048 = kt * 4096.
|
||||
@@ -144,7 +144,7 @@ test_umma_hd64(const bf16_t* __restrict__ q, const bf16_t* __restrict__ k,
|
||||
if (tid == 0) {
|
||||
for (int j = 0; j < SK; j++) {
|
||||
float dot = 0.0f;
|
||||
for (int d = 0; d < HD; d++)
|
||||
for (int d = 0; d < 16; d++) // DEBUG: single K-tile
|
||||
dot += bf16_to_f32(q[d]) * bf16_to_f32(k[j * HD + d]);
|
||||
s_scalar[j] = dot * scale;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user