From 94b30dc2bc5855dae160e274d336285aaf743fdf Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 12 May 2026 15:04:23 +0000 Subject: [PATCH] =?UTF-8?q?revert:=20block=5Fn/4=20was=20correct=20(SwiGLU?= =?UTF-8?q?=20halving=20=C3=97=20FP4=20packing)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp b/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp index 12a78d3..f99de14 100644 --- a/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp +++ b/csrc/jit_kernels/impls/sm100_fp8_nvfp4_mega_moe.hpp @@ -157,10 +157,10 @@ static void sm100_fp8_nvfp4_mega_moe( intermediate_hidden * 2, hidden, config.block_n, kGranK, num_experts_per_rank, 0); - // L1 output: packed E2M1, K-dim = intermediate_hidden/2, inner = block_n/2 bytes (packed), no swizzle (v1) + // L1 output: packed E2M1, K-dim = intermediate_hidden/2, inner = block_n/4 bytes (SwiGLU halving × FP4 packing), no swizzle (v1) const auto tensor_map_l1_output = make_tma_2d_desc(l2_acts, intermediate_hidden / 2, config.num_max_pool_tokens, - config.block_n / 2, config.store_block_m, + config.block_n / 4, config.store_block_m, static_cast(l2_acts.stride(-2)), 0, 0, // no swizzle false, // allow_tf32