fix: use scale_vec::2X (block32) for SM100 B200 compatibility
scale_vec::4X (block16) requires SM103/SM120 (B300/GB300), not SM100 (B200). Revert to block32 with UE4M3 scales. This uses the same TMEM layout as MXFP4, but with the UE4M3 scale format instead of UE8M0.

Changes:
- kGranK: 16 → 32
- PTX: scale_vec::4X → scale_vec::2X
- SF layout: same as MXFP4 (K/32, K/128 for int32 packed)
- UTCCP: i*8 → i*4 (2X layout, same as MXFP4)
- TMEM columns: same as MXFP4 (SF_BLOCK_M/32, SF_BLOCK_N/32)
- Python: merge NVFP4 block16 → block32 scales (max of adjacent pairs)
- recipe: (1,1,16) → (1,1,32)
This commit is contained in:
@@ -98,9 +98,9 @@ sm100_fp8_nvfp4_mega_moe_impl(void* y,
|
||||
constexpr auto fp8_token_layout = layout::Data(kHidden);
|
||||
constexpr auto bf16_token_layout = layout::Data(kHidden * sizeof(nv_bfloat16));
|
||||
constexpr auto fp8_intermediate_token_layout = layout::Data(kIntermediateHidden);
|
||||
// NVFP4: group_size=16, so SF stride is K/16 (twice as many scales as MXFP4)
|
||||
constexpr auto fp8_sf_layout = layout::Data(kHidden / 16);
|
||||
constexpr auto fp8_intermediate_sf_layout = layout::Data(kIntermediateHidden / 16);
|
||||
// NVFP4: scale_vec::2X (block32) on SM100, same SF stride as MXFP4
|
||||
constexpr auto fp8_sf_layout = layout::Data(kHidden / 32);
|
||||
constexpr auto fp8_intermediate_sf_layout = layout::Data(kIntermediateHidden / 32);
|
||||
constexpr auto input_topk_idx_layout = layout::Data(kNumTopk * sizeof(int64_t), false);
|
||||
constexpr auto input_topk_weights_layout = layout::Data(kNumTopk * sizeof(float), false);
|
||||
constexpr auto l1_topk_weights_layout = layout::Data(sizeof(float), false);
|
||||
@@ -120,8 +120,10 @@ sm100_fp8_nvfp4_mega_moe_impl(void* y,
|
||||
input_topk_idx_buffer.get_end_ptr());
|
||||
|
||||
// SF and its buffer configs
|
||||
// NVFP4: group_size=16 → kGranK=16 (vs MXFP4's 32)
|
||||
constexpr uint32_t kGranK = 16;
|
||||
// NVFP4 on SM100: scale_vec::2X (block32), group_size=32 with UE4M3 scales
|
||||
// Note: scale_vec::4X (block16) requires SM103/SM120 (B300/GB300), not SM100
|
||||
// So we use block32 and merge pairs of NVFP4 block16 scales
|
||||
constexpr uint32_t kGranK = 32;
|
||||
// For NVFP4 scale_vec::2X (block32), UTCCP alignment is still 128 elements
|
||||
constexpr uint32_t kNumUTCCPAlignedElems = 128;
|
||||
DG_STATIC_ASSERT(SF_BLOCK_M == math::constexpr_align(BLOCK_M, kNumUTCCPAlignedElems), "Invalid SF_BLOCK_M");
|
||||
@@ -220,11 +222,9 @@ sm100_fp8_nvfp4_mega_moe_impl(void* y,
|
||||
|
||||
// Tensor memory size
|
||||
constexpr uint32_t kNumAccumTmemCols = UMMA_N * kNumEpilogueStages;
|
||||
// NVFP4: scale_vec::4X → 4 SF per UMMA atom row → 4 TMEM cols per SF row
|
||||
// For bM=128, SFA uses 4 rows × 4 cols = 16 TMEM columns
|
||||
// SFB uses BLOCK_N/32 rows × 4 cols
|
||||
constexpr uint32_t kNumSFATmemCols = SF_BLOCK_M / 32 * 4;
|
||||
constexpr uint32_t kNumSFBTmemCols = SF_BLOCK_N / 32 * 4;
|
||||
// NVFP4 scale_vec::2X: same TMEM layout as MXFP4
|
||||
constexpr uint32_t kNumSFATmemCols = SF_BLOCK_M / 32;
|
||||
constexpr uint32_t kNumSFBTmemCols = SF_BLOCK_N / 32;
|
||||
constexpr uint32_t kNumTmemCols = utils::get_num_aligned_tmem_cols<kNumAccumTmemCols + kNumSFATmemCols + kNumSFBTmemCols>();
|
||||
constexpr uint32_t kTmemStartColOfSFA = kNumAccumTmemCols;
|
||||
constexpr uint32_t kTmemStartColOfSFB = kNumAccumTmemCols + kNumSFATmemCols;
|
||||
@@ -563,9 +563,9 @@ sm100_fp8_nvfp4_mega_moe_impl(void* y,
|
||||
__syncwarp();
|
||||
|
||||
// Load and store SF (overlaps with TMA token load)
|
||||
// NVFP4: group_size=16, 4 UE4M3 scales per uint32
|
||||
constexpr uint32_t kNumSFUint32 = kHidden / 64;
|
||||
DG_STATIC_ASSERT(kNumSFUint32 > 0 and kHidden % 64 == 0, "Invalid SF");
|
||||
// NVFP4 block32: same SF uint32 count as MXFP4
|
||||
constexpr uint32_t kNumSFUint32 = kHidden / 128;
|
||||
DG_STATIC_ASSERT(kNumSFUint32 > 0 and kHidden % 128 == 0, "Invalid SF");
|
||||
const auto remote_sf_ptr = sym_buffer.map(
|
||||
input_sf_buffer.get_data_buffer(src_token_idx).get_base_ptr<uint32_t>(),
|
||||
current_rank_in_expert_idx);
|
||||
@@ -846,21 +846,19 @@ sm100_fp8_nvfp4_mega_moe_impl(void* y,
|
||||
const auto b_desc_base_lo = ptx::exchange(b_desc_lo, stage_idx);
|
||||
if (cute::elect_one_sync()) {
|
||||
// UTCCP copy SFA and SFB to TMEM
|
||||
// NVFP4: scale_vec::4X, each 128-element block → 8 TMEM cols
|
||||
// NVFP4 scale_vec::2X: same layout as MXFP4
|
||||
using cute_utccp_t = cute::SM100_UTCCP_4x32dp128bit_2cta;
|
||||
|
||||
#pragma unroll
|
||||
for (uint32_t i = 0; i < SF_BLOCK_M / kNumUTCCPAlignedElems; ++ i) {
|
||||
auto smem_ptr = smem_sfa[stage_idx] + i * kNumUTCCPAlignedElems;
|
||||
mma::sm100::replace_smem_desc_addr(sf_desc, smem_ptr);
|
||||
// NVFP4 4X: 8 TMEM columns per 128-element SF group
|
||||
cute_utccp_t::copy(sf_desc, kTmemStartColOfSFA + i * 8);
|
||||
cute_utccp_t::copy(sf_desc, kTmemStartColOfSFA + i * 4);
|
||||
}
|
||||
#pragma unroll
|
||||
for (uint32_t i = 0; i < SF_BLOCK_N / kNumUTCCPAlignedElems; ++ i) {
|
||||
auto smem_ptr = smem_sfb[stage_idx] + i * kNumUTCCPAlignedElems;
|
||||
mma::sm100::replace_smem_desc_addr(sf_desc, smem_ptr);
|
||||
cute_utccp_t::copy(sf_desc, kTmemStartColOfSFB + i * 8);
|
||||
cute_utccp_t::copy(sf_desc, kTmemStartColOfSFB + i * 4);
|
||||
}
|
||||
|
||||
// Issue UMMA
|
||||
|
||||
@@ -153,7 +153,7 @@ struct SM100_MMA_MXF4NVF4_2x1SM_SS {
|
||||
"{\n\t"
|
||||
".reg .pred p;\n\t"
|
||||
"setp.ne.b32 p, %4, 0;\n\t"
|
||||
"tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%5], [%6], p; \n\t"
|
||||
"tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%5], [%6], p; \n\t"
|
||||
"}\n"
|
||||
:
|
||||
: "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(static_cast<uint32_t>(desc >> 32)), "r"(scale_c),
|
||||
@@ -175,7 +175,7 @@ struct SM100_MMA_MXF4NVF4_SS {
|
||||
"{\n\t"
|
||||
".reg .pred p;\n\t"
|
||||
"setp.ne.b32 p, %4, 0;\n\t"
|
||||
"tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%5], [%6], p; \n\t"
|
||||
"tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%5], [%6], p; \n\t"
|
||||
"}\n"
|
||||
:
|
||||
: "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(static_cast<uint32_t>(desc >> 32)), "r"(scale_c),
|
||||
|
||||
Reference in New Issue
Block a user