Fix weight TMA descriptors: packed E2M1 needs K/2, block_k/2, swizzle/2

Weights are packed E2M1 (2 per byte) but TMA descriptors were using
unpacked dimensions — K-dim in elements instead of bytes, 128B swizzle
instead of 64B, full block_k instead of block_k/2. This caused OOB
reads and swizzle mismatch with the UMMA descriptor, producing
illegal instruction traps.
This commit is contained in:
2026-05-12 06:51:39 +00:00
parent fbfeb54c9a
commit c0850a6859

View File

@@ -148,10 +148,11 @@ static void sm100_fp8_nvfp4_mega_moe(
config.sf_block_m, kGranK,
1, 0);
const auto tensor_map_l1_weights = make_tma_2d_desc(l1_weights,
hidden, num_experts_per_rank * intermediate_hidden * 2,
config.block_k, config.load_block_n,
hidden / 2, num_experts_per_rank * intermediate_hidden * 2,
config.block_k / 2, config.load_block_n,
static_cast<int>(l1_weights.stride(-2)),
config.swizzle_weights_mode);
config.swizzle_weights_mode / 2,
0, false, false); // fp4_unpacked_smem=false (packed!)
const auto tensor_map_l1_weights_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l1_weights_sf,
intermediate_hidden * 2, hidden,
config.block_n, kGranK,
@@ -176,10 +177,11 @@ static void sm100_fp8_nvfp4_mega_moe(
config.sf_block_m, kGranK,
1, 0);
const auto tensor_map_l2_weights = make_tma_2d_desc(l2_weights,
intermediate_hidden, num_experts_per_rank * hidden,
config.block_k, config.load_block_n,
intermediate_hidden / 2, num_experts_per_rank * hidden,
config.block_k / 2, config.load_block_n,
static_cast<int>(l2_weights.stride(-2)),
config.swizzle_weights_mode);
config.swizzle_weights_mode / 2,
0, false, false); // fp4_unpacked_smem=false (packed!)
const auto tensor_map_l2_weights_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l2_weights_sf,
hidden, intermediate_hidden,
config.block_n, kGranK,