[Perf][Kernel] Optimize FP4 quantization kernels (SM100F) (#32520)

Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
2026-01-25 02:45:27 +01:00
parent 1ebdff412a
commit fcb9df99bd
18 changed files with 508 additions and 151 deletions
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -293,7 +293,8 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);

 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                      torch::Tensor& output_scale,
-                      torch::Tensor const& input_scale);
+                      torch::Tensor const& input_scale,
+                      bool is_sf_swizzled_layout);

 void scaled_fp4_experts_quant(
    torch::Tensor& output, torch::Tensor& output_scale,