diff --git a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu index fb3e05cf..bbed2662 100644 --- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu +++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu @@ -122,7 +122,15 @@ __global__ void remap_sf_to_cutlass_kernel( int k_group = src_idx % K_sf; int k_elem = k_group * SFVecSize; - dst[layout_sf(m, k_elem)] = src[src_idx]; + // Construct a coordinate matching the layout's top-level mode structure. + // The layout shape from tile_to_shape(SfAtom{}, make_shape(MN, K), Step<_2, _1>) + // has two top-level modes: + // Mode 0 (M): shape (32, 4) — the SfAtom's M sub-structure + // Mode 1 (K): int — the K dimension + // For mode 0, we decompose m into (inner_m, sub_m) = (m / 4, m % 4) + // For mode 1, we pass k_elem directly. + auto coord = cute::make_tuple(cute::make_tuple(m / 4, m % 4), k_elem); + dst[layout_sf(coord)] = src[src_idx]; } /////////////////////////////////////////////////////////////////////////////////////////////////