diff --git a/dsv4/kernels/cuda/indexer_fp8_score_topk.cu b/dsv4/kernels/cuda/indexer_fp8_score_topk.cu index 69975dd2..e2a6f0ae 100644 --- a/dsv4/kernels/cuda/indexer_fp8_score_topk.cu +++ b/dsv4/kernels/cuda/indexer_fp8_score_topk.cu @@ -149,8 +149,7 @@ indexer_fp8_score_topk_kernel( constexpr int MMA_K_F8 = 32; constexpr int NKT = 4; // ihd=128 / MMA_K_F8=32 constexpr int TILE_F8 = 128 * 32; // 4096 bytes per SMEM tile - constexpr int TMEM_COLS = 256; // 128 rows × 128 cols needs 4×128 = 512, - // but only 64 rows used (2×128 = 256) + constexpr int TMEM_COLS = 512; // 128 rows × 128 cols → 4 row-groups × 128 cols = 512 const int tid = threadIdx.x; const int wid = tid >> 5;