// csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu

#include "c3x/scaled_mm_helper.hpp"
#include "c3x/scaled_mm_kernels.hpp"

/*
   This file defines quantized GEMM operations using the CUTLASS 3.x API, for
   NVIDIA GPUs with sm90a (Hopper).
*/

#if defined ENABLE_SCALED_MM_SM90 && ENABLE_SCALED_MM_SM90

// Entry point for the sm90 (Hopper) scaled matrix multiply.
//
// This function does no work of its own: it passes every argument through to
// dispatch_scaled_mm together with the three sm90 kernel entry points (fp8,
// int8, and blockwise fp8). Which of those kernels actually runs is decided
// inside dispatch_scaled_mm, which is defined outside this file.
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            std::optional<torch::Tensor> const& bias) {
  // Bind the kernel entry points by reference so the dispatcher receives the
  // very same entities it would get if they were named inline in the call.
  auto& fp8_kernel = vllm::cutlass_scaled_mm_sm90_fp8;
  auto& int8_kernel = vllm::cutlass_scaled_mm_sm90_int8;
  auto& blockwise_fp8_kernel = vllm::cutlass_scaled_mm_blockwise_sm90_fp8;

  dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias, fp8_kernel,
                     int8_kernel, blockwise_fp8_kernel);
}

// sm90 entry point for the int8 scaled matrix multiply that takes AZP inputs
// (`azp_adj` and the optional `azp`).
//
// Verifies that both scale tensors are float32, then forwards every argument
// unchanged to vllm::cutlass_scaled_mm_azp_sm90_int8. Any further validation
// of `out`, `a`, `b`, `azp_adj`, `azp`, or `bias` is left to that callee.
//
// Fails via TORCH_CHECK if `a_scales` or `b_scales` is not float32; the error
// message names this function and reports the dtype that was received.
void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
                                torch::Tensor const& b,
                                torch::Tensor const& a_scales,
                                torch::Tensor const& b_scales,
                                torch::Tensor const& azp_adj,
                                std::optional<torch::Tensor> const& azp,
                                std::optional<torch::Tensor> const& bias) {
  // Check a_scales before b_scales so the first failure reported matches the
  // original ordering of the checks.
  TORCH_CHECK(a_scales.dtype() == torch::kFloat32,
              "cutlass_scaled_mm_azp_sm90: a_scales must be float32, got ",
              a_scales.dtype());
  TORCH_CHECK(b_scales.dtype() == torch::kFloat32,
              "cutlass_scaled_mm_azp_sm90: b_scales must be float32, got ",
              b_scales.dtype());

  vllm::cutlass_scaled_mm_azp_sm90_int8(out, a, b, a_scales, b_scales, azp_adj,
                                        azp, bias);
}

#endif
Add cutlass support for blackwell fp8 blockwise gemm (#14383) Signed-off-by: Shu Wang <shuw@nvidia.com> 2025-05-08 17:09:55 -05:00			`#include "c3x/scaled_mm_helper.hpp"`
[Kernel] Update `cutlass_scaled_mm` to support 2d group (blockwise) scaling (#11868) 2025-01-30 21:33:00 -05:00			`#include "c3x/scaled_mm_kernels.hpp"`
[Build] Guard against older CUDA versions when building CUTLASS 3.x kernels (#5168) 2024-05-31 20:21:38 -04:00
[Kernel] Add w8a8 CUTLASS kernels (#4749) 2024-05-16 18:32:50 -04:00			`/*`
[Kernel] Factor out epilogues from cutlass kernels (#5391) Co-authored-by: Michael Goin <michael@neuralmagic.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: zifeitong <zifei.tong@parasail.io> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> 2024-06-13 14:22:19 -04:00			`This file defines quantized GEMM operations using the CUTLASS 3.x API, for`
[Build/BugFix] Fix hopper 12.8 build (#14354) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> 2025-03-08 03:11:56 -05:00			`NVIDIA GPUs with sm90a (Hopper).`
[Kernel] Add w8a8 CUTLASS kernels (#4749) 2024-05-16 18:32:50 -04:00			`*/`

[Build/BugFix] Fix hopper 12.8 build (#14354) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> 2025-03-08 03:11:56 -05:00			`#if defined ENABLE_SCALED_MM_SM90 && ENABLE_SCALED_MM_SM90`

[Kernel] Adding bias epilogue support for `cutlass_scaled_mm` (#5560) Co-authored-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> 2024-06-26 11:16:00 -04:00			`void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,`
			`torch::Tensor const& b,`
			`torch::Tensor const& a_scales,`
			`torch::Tensor const& b_scales,`
[MISC] Replace c10::optional with std::optional (#11730) Signed-off-by: Lu Fang <lufang@fb.com> 2025-01-04 17:20:34 -08:00			`std::optional<torch::Tensor> const& bias) {`
Add cutlass support for blackwell fp8 blockwise gemm (#14383) Signed-off-by: Shu Wang <shuw@nvidia.com> 2025-05-08 17:09:55 -05:00			`dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias,`
			`vllm::cutlass_scaled_mm_sm90_fp8,`
			`vllm::cutlass_scaled_mm_sm90_int8,`
			`vllm::cutlass_scaled_mm_blockwise_sm90_fp8);`
[Kernel] Adding bias epilogue support for `cutlass_scaled_mm` (#5560) Co-authored-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> 2024-06-26 11:16:00 -04:00			`}`

[Kernel] Add per-tensor and per-token AZP epilogues (#5941) Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> 2024-08-06 14:17:08 -04:00			`void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,`
			`torch::Tensor const& b,`
			`torch::Tensor const& a_scales,`
			`torch::Tensor const& b_scales,`
			`torch::Tensor const& azp_adj,`
[MISC] Replace c10::optional with std::optional (#11730) Signed-off-by: Lu Fang <lufang@fb.com> 2025-01-04 17:20:34 -08:00			`std::optional<torch::Tensor> const& azp,`
			`std::optional<torch::Tensor> const& bias) {`
[Kernel] Add per-tensor and per-token AZP epilogues (#5941) Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> 2024-08-06 14:17:08 -04:00			`TORCH_CHECK(a_scales.dtype() == torch::kFloat32);`
			`TORCH_CHECK(b_scales.dtype() == torch::kFloat32);`

[Kernel] Update `cutlass_scaled_mm` to support 2d group (blockwise) scaling (#11868) 2025-01-30 21:33:00 -05:00			`vllm::cutlass_scaled_mm_azp_sm90_int8(out, a, b, a_scales, b_scales, azp_adj,`
			`azp, bias);`
[Kernel] Add per-tensor and per-token AZP epilogues (#5941) Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> 2024-08-06 14:17:08 -04:00			`}`
add cutlass support for blackwell fp8 gemm (#13798) 2025-03-04 07:55:07 -08:00
			`#endif`