diff --git a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu index 9a7cf49f..c5852e5d 100644 --- a/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu +++ b/src/nvfp4_megamoe_kernel/cutlass_nvfp4_gemm/cutlass_nvfp4_gemm.cu @@ -136,7 +136,7 @@ __global__ void remap_sf_to_cutlass_kernel( // This avoids garbled output from concurrent threads if (threadIdx.x == 0 && blockIdx.x == 0) { // Print a series of indices to understand the decomposition - int debug_indices[] = {0, 1, 2, 3, 4, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256, 511, 512, 512*448-1}; + int debug_indices[] = {0, 1, 4, 16, 64, 128, 256, 511, 512, 513, 516, 1024, 2048, 4096, 8192, 65536, 131072}; int n_debug = sizeof(debug_indices) / sizeof(debug_indices[0]); for (int di = 0; di < n_debug; di++) { int didx = debug_indices[di];