diff --git a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu index 5ae26fe5..dc7c25de 100644 --- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu +++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu @@ -135,18 +135,18 @@ __global__ void remap_sf_to_cutlass_kernel( int mn, k_sf_val, src_idx; if (col_major_src) { - // source is row-major (K_sf, MN), e.g. SFB stored as (K_sf, N) + // Source is (K_sf, MN) row-major in memory — e.g. SFB after transpose+contiguous k_sf_val = tid / MN; mn = tid % MN; - src_idx = tid; + src_idx = tid; // tid = k_sf_val * MN + mn } else { - // source is row-major (MN, K_sf), e.g. SFA stored as (M, K_sf) + // Source is (MN, K_sf) row-major — e.g. SFA mn = tid / K_sf; k_sf_val = tid % K_sf; - src_idx = tid; + src_idx = tid; // tid = mn * K_sf + k_sf_val } - // Use layout forward mapping: source (mn, k_sf*16) -> dst_idx + // Use layout forward mapping: (mn, k_sf*16) -> dst_idx constexpr int LayoutRank = cute::rank_v; int dst_idx = 0;