debug: skip TMEM alloc — test SMEM loads only
This commit is contained in:
@@ -48,6 +48,9 @@ fmha_qk_verify(
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// SKIP TMEM — just test SMEM loads and scalar QK
|
||||
// No TMEM alloc, no MMA
|
||||
/*
|
||||
// TMEM alloc for S: 128 columns
|
||||
if (wid == 0) {
|
||||
uint32_t smem_ptr = __cvta_generic_to_shared(sTmemBase);
|
||||
@@ -55,6 +58,8 @@ fmha_qk_verify(
|
||||
}
|
||||
__syncthreads();
|
||||
uint32_t tmem_base = *sTmemBase;
|
||||
*/
|
||||
uint32_t tmem_base = 0; // dummy
|
||||
|
||||
// Zero TMEM S
|
||||
if (wid == 0) {
|
||||
|
||||
Reference in New Issue
Block a user