test: force 1 K-tile for HD=64 debug

This commit is contained in:
2026-05-28 11:46:12 +00:00
parent abe1870429
commit 73bd21ce01

View File

@@ -29,7 +29,7 @@ test_umma_qk_hd64(const bf16_t* q, const bf16_t* k,
{
const int tid = threadIdx.x;
const int wid = tid / 32, lane = tid % 32;
const int n_ktiles = hd / MMA_K; // 4 for hd=64
const int n_ktiles = 1; // DEBUG: forced to a single K-tile; the normal value is hd / MMA_K
// SMEM: sQ (128, HD) canonical + sK (128, HD) canonical
// Each K-tile of (128, 16) = 4096 bytes