Support CUTLASS NVFP4 (w4a4) for Blackwell GeForce GPUs (SM120) (#21309)

Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
2025-08-03 09:54:22 +02:00
parent 3f36c325fa
commit 789562c28c
6 changed files with 329 additions and 13 deletions
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -332,7 +332,7 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
                                    int multiProcessorCount,
                                    cudaStream_t stream);

-void scaled_fp4_quant_sm100a(torch::Tensor const& output,
+void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                             torch::Tensor const& input,
                             torch::Tensor const& output_sf,
                             torch::Tensor const& input_sf) {