Fix weight TMA descriptors: packed E2M1 needs K/2, block_k/2, swizzle/2
Weights are packed E2M1 (two values per byte), but the weight TMA descriptors were built with unpacked dimensions: the K dimension was given in elements instead of bytes, the swizzle was 128B instead of 64B, and the K block was the full block_k instead of block_k/2. This caused out-of-bounds (OOB) reads and a swizzle mismatch with the UMMA descriptor, producing illegal instruction traps.
This commit is contained in:
@@ -148,10 +148,11 @@ static void sm100_fp8_nvfp4_mega_moe(
|
||||
config.sf_block_m, kGranK,
|
||||
1, 0);
|
||||
const auto tensor_map_l1_weights = make_tma_2d_desc(l1_weights,
|
||||
hidden, num_experts_per_rank * intermediate_hidden * 2,
|
||||
config.block_k, config.load_block_n,
|
||||
hidden / 2, num_experts_per_rank * intermediate_hidden * 2,
|
||||
config.block_k / 2, config.load_block_n,
|
||||
static_cast<int>(l1_weights.stride(-2)),
|
||||
config.swizzle_weights_mode);
|
||||
config.swizzle_weights_mode / 2,
|
||||
0, false, false); // fp4_unpacked_smem=false (packed!)
|
||||
const auto tensor_map_l1_weights_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l1_weights_sf,
|
||||
intermediate_hidden * 2, hidden,
|
||||
config.block_n, kGranK,
|
||||
@@ -176,10 +177,11 @@ static void sm100_fp8_nvfp4_mega_moe(
|
||||
config.sf_block_m, kGranK,
|
||||
1, 0);
|
||||
const auto tensor_map_l2_weights = make_tma_2d_desc(l2_weights,
|
||||
intermediate_hidden, num_experts_per_rank * hidden,
|
||||
config.block_k, config.load_block_n,
|
||||
intermediate_hidden / 2, num_experts_per_rank * hidden,
|
||||
config.block_k / 2, config.load_block_n,
|
||||
static_cast<int>(l2_weights.stride(-2)),
|
||||
config.swizzle_weights_mode);
|
||||
config.swizzle_weights_mode / 2,
|
||||
0, false, false); // fp4_unpacked_smem=false (packed!)
|
||||
const auto tensor_map_l2_weights_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l2_weights_sf,
|
||||
hidden, intermediate_hidden,
|
||||
config.block_n, kGranK,
|
||||
|
||||
Reference in New Issue
Block a user