fix: remove smem_inner_dim doubling for packed FP4 TMA — TMA row width must match the MMA's SMEM row width (BLOCK_K/2 bytes)

This commit is contained in:
2026-05-12 17:14:44 +00:00
parent b95f9eb446
commit 75f1c8544b

View File

@@ -124,9 +124,9 @@ static CUtensorMap make_tma_2d_desc(const torch::Tensor& t,
// Inner dim must be a multiple of 64B for .b4x16_p64
DG_HOST_ASSERT(not fp4_unpacked_smem or gmem_inner_dim % 128 == 0);
// Fix FP4 packed smem
if (not fp4_unpacked_smem and swizzle_mode != 0)
smem_inner_dim = swizzle_mode * 2;
// For packed FP4 (mxf4nvf4): smem_inner_dim must match the MMA's expected
// SMEM row width (BLOCK_K/2 bytes). The default swizzle/elem_size gives the
// correct value — do NOT double it. The TMA and MMA must agree on row width.
}
CUtensorMap tensor_map;