test: force 1 K-tile for HD=64 debug
This commit is contained in:
@@ -29,7 +29,7 @@ test_umma_qk_hd64(const bf16_t* q, const bf16_t* k,
|
||||
{
|
||||
const int tid = threadIdx.x;
|
||||
const int wid = tid / 32, lane = tid % 32;
|
||||
const int n_ktiles = hd / MMA_K; // 4 for hd=64
|
||||
const int n_ktiles = 1; // hd / MMA_K; // Force 1 K-tile for debugging
|
||||
|
||||
// SMEM: sQ (128, HD) canonical + sK (128, HD) canonical
|
||||
// Each K-tile of (128, 16) = 4096 bytes
|
||||
|
||||
Reference in New Issue
Block a user