From 2ac3a7d63103e4023034f30980f4bd057b3084ae Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 14 May 2026 15:32:12 +0000 Subject: [PATCH] fix: construct nested coordinate for CuTe layout shape ((32,4), K) Calling layout_sf(m, k_elem) with flat ints fails with a "Mismatched Ranks" error because the layout shape is ((32,4), K_padded), not (M, K). Decompose m into (inner_m, sub_m) = (m/4, m%4) to match the (32,4) sub-shape, and pass the coordinate as make_tuple(make_tuple(inner_m, sub_m), k_elem). --- .../cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu index fb3e05cf..bbed2662 100644 --- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu +++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu @@ -122,7 +122,15 @@ __global__ void remap_sf_to_cutlass_kernel( int k_group = src_idx % K_sf; int k_elem = k_group * SFVecSize; - dst[layout_sf(m, k_elem)] = src[src_idx]; + // Construct a coordinate matching the layout's top-level mode structure. + // The layout shape from tile_to_shape(SfAtom{}, make_shape(MN, K), Step<_2, _1>) + // has two top-level modes: + // Mode 0 (M): shape (32, 4) — the SfAtom's M sub-structure + // Mode 1 (K): int — the K dimension + // For mode 0, we decompose m into (inner_m, sub_m) = (m / 4, m % 4) + // For mode 1, we pass k_elem directly. + auto coord = cute::make_tuple(cute::make_tuple(m / 4, m % 4), k_elem); + dst[layout_sf(coord)] = src[src_idx]; } /////////////////////////////////////////////////////////////////////////////////////////////////