Support CUTLASS NVFP4 (w4a4) for Blackwell GeForce GPUs (SM120) (#21309)

Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
This commit is contained in:
Roberto L. Castro
2025-08-03 09:54:22 +02:00
committed by GitHub
parent 3f36c325fa
commit 789562c28c
6 changed files with 329 additions and 13 deletions

View File

@@ -332,7 +332,7 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
int multiProcessorCount,
cudaStream_t stream);
void scaled_fp4_quant_sm100a(torch::Tensor const& output,
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
torch::Tensor const& input,
torch::Tensor const& output_sf,
torch::Tensor const& input_sf) {