fix: remove smem_inner_dim doubling for packed FP4 TMA — must match MMA row width (BLOCK_K/2)
This commit is contained in:
@@ -124,9 +124,9 @@ static CUtensorMap make_tma_2d_desc(const torch::Tensor& t,
|
||||
// Inner dim must be a multiple of 64B (128 4-bit elements, hence the `% 128` check) for .b4x16_p64
|
||||
DG_HOST_ASSERT(not fp4_unpacked_smem or gmem_inner_dim % 128 == 0);
|
||||
|
||||
// Fix FP4 packed smem
|
||||
if (not fp4_unpacked_smem and swizzle_mode != 0)
|
||||
smem_inner_dim = swizzle_mode * 2;
|
||||
// For packed FP4 (mxf4nvf4): smem_inner_dim must match the MMA's expected
|
||||
// SMEM row width (BLOCK_K/2 bytes). The default swizzle/elem_size gives the
|
||||
// correct value — do NOT double it. The TMA and MMA must agree on row width.
|
||||
}
|
||||
|
||||
CUtensorMap tensor_map;
|
||||
|
||||
Reference in New Issue
Block a user