test: re-enable TMEM zeroing with tmem_base debug
This commit is contained in:
@@ -71,14 +71,25 @@ test_umma_qk_hd16(
|
||||
__syncthreads();
|
||||
uint32_t tmem_base = *sTmemBase;
|
||||
|
||||
// Zero TMEM — skip for now
|
||||
// if (wid == 0) {
|
||||
// for (int col = 0; col < 32; col++) {
|
||||
// tmem_store(tmem_base + col, 0, 0, 0, 0);
|
||||
// }
|
||||
// tmem_fence_store();
|
||||
// }
|
||||
// __syncthreads();
|
||||
// Debug: write a sentinel to the output slot first; it is overwritten with tmem_base after the TMEM ops
|
||||
if (tid == 0) {
|
||||
s_out[138] = 999.0f; // sentinel before tmem ops
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Zero TMEM — re-enable with debug
|
||||
if (wid == 0) {
|
||||
for (int col = 0; col < 128; col++) {
|
||||
tmem_store(tmem_base + col, 0, 0, 0, 0);
|
||||
}
|
||||
tmem_fence_store();
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0) {
|
||||
s_out[138] = (float)tmem_base; // write tmem_base after tmem ops
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// ================================================================
|
||||
// Load Q and K into SMEM in canonical layout
|
||||
|
||||
Reference in New Issue
Block a user