[BugFix] Fix fp4 quant kernel on CUDA 12.8 (#35210)

Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
2026-02-26 03:32:50 +01:00
parent 160424a937
commit 86c3b5a808
2 changed files with 11 additions and 7 deletions
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -107,7 +107,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
        } else {
-          out[inOffset] = out_val;
+          int64_t outOffset =
+              rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+          out[outOffset] = out_val;
        }
      }
    }
@@ -140,7 +142,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
  int const numBlocksPerSM =
      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));

-  int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
+  int sf_n_unpadded = int(n / CVT_FP4_ELTS_PER_THREAD);

  int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
  int grid_x = std::min(