diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_nvfp4_mega_moe.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_nvfp4_mega_moe.cuh index dccf16f..7580ff4 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_fp8_nvfp4_mega_moe.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_nvfp4_mega_moe.cuh @@ -851,20 +851,21 @@ sm100_fp8_nvfp4_mega_moe_impl(void* y, // DIAGNOSTIC: Force-override instr_desc bitfields // Test 1: Force a_format/b_format to 5 (MXF8F6F4Format::E2M1 encoding) // MXF4Format::E2M1=1 but MXF8F6F4Format::E2M1=5 — hardware may expect 5 + // RESULT: format=5 makes no difference, disabled // Test 2: Force scale_format to 1 (E8M0) to see if bit 23 matters // Test 3: a_sf_id/b_sf_id already set by make_runtime_instr_desc_with_sf_id - { - uint32_t raw = static_cast(instr_desc); - // Clear a_format [7,10) and b_format [10,13), then OR in 5 for both - raw = (raw & ~((0x7u << 7) | (0x7u << 10))) | (5u << 7) | (5u << 10); - // Force scale_format bit [23] to 1 (E8M0) - // raw |= (1u << 23); // uncomment to test scale_fmt=1 - instr_desc = *reinterpret_cast(&raw); - if (lane_idx == 0) { - printf("[DIAG-FORCE] after override: raw=0x%08x a_fmt=%u b_fmt=%u scale_fmt=%u\n", - raw, (raw >> 7) & 7, (raw >> 10) & 7, (raw >> 23) & 1); - } - } + // { + // uint32_t raw = static_cast(instr_desc); + // // Clear a_format [7,10) and b_format [10,13), then OR in 5 for both + // raw = (raw & ~((0x7u << 7) | (0x7u << 10))) | (5u << 7) | (5u << 10); + // // Force scale_format bit [23] to 1 (E8M0) + // // raw |= (1u << 23); // uncomment to test scale_fmt=1 + // instr_desc = *reinterpret_cast(&raw); + // if (lane_idx == 0) { + // printf("[DIAG-FORCE] after override: raw=0x%08x a_fmt=%u b_fmt=%u scale_fmt=%u\n", + // raw, (raw >> 7) & 7, (raw >> 10) & 7, (raw >> 23) & 1); + // } + // } // Wait tensor memory empty barrier arrival const auto accum_stage_idx = current_iter_idx % kNumEpilogueStages;