From f6fd549800eb5fa374027a411dce9e442d9d6592 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 15 May 2026 21:19:58 +0000 Subject: [PATCH] fix: restore col_major_src handling for SFB source layout SFB scales arrive as (K_sf, N) row-major after transpose+contiguous in weight_transform.py. The col_major_src flag correctly describes this. Don't assume both sources are (MN, K_sf). --- .../cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu index 5ae26fe5..dc7c25de 100644 --- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu +++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu @@ -135,18 +135,18 @@ __global__ void remap_sf_to_cutlass_kernel( int mn, k_sf_val, src_idx; if (col_major_src) { - // source is row-major (K_sf, MN), e.g. SFB stored as (K_sf, N) + // Source is (K_sf, MN) row-major in memory — e.g. SFB after transpose+contiguous k_sf_val = tid / MN; mn = tid % MN; - src_idx = tid; + src_idx = tid; // tid = k_sf_val * MN + mn } else { - // source is row-major (MN, K_sf), e.g. SFA stored as (M, K_sf) + // Source is (MN, K_sf) row-major — e.g. SFA mn = tid / K_sf; k_sf_val = tid % K_sf; - src_idx = tid; + src_idx = tid; // tid = mn * K_sf + k_sf_val } - // Use layout forward mapping: source (mn, k_sf*16) -> dst_idx + // Use layout forward mapping: (mn, k_sf*16) -> dst_idx constexpr int LayoutRank = cute::rank_v; int dst_idx = 0;