From c3841983a08ae6ba3a6dfc200c3b74fb8a4d553f Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 15 May 2026 18:52:23 +0000 Subject: [PATCH] fix: SF remap uses cute::cosize() instead of cute::size() The comment explicitly warned about this: allocation uses cosize (physical size including tile padding) but the iteration bound used size (logical size). This meant padding positions in the CUTLASS SF layout were never written, leaving them as zero instead of their actual SF values. With uniform data (all-ones), all SF values are the same so the bug was invisible. With random data, different SF values are needed at different positions and the missing writes corrupt the result. --- .../cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu index da9196b2..b880cfb4 100644 --- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu +++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu @@ -125,7 +125,7 @@ __global__ void remap_sf_to_cutlass_kernel( bool col_major_src = false // true if source is (K_sf, MN) row-major ) { int dst_idx = blockIdx.x * blockDim.x + threadIdx.x; - int total = cute::size(layout_sf); + int total = cute::cosize(layout_sf); if (dst_idx >= total) return; auto coord = cute::idx2crd(dst_idx, layout_sf.shape(), layout_sf.stride());