[Kernel] Add support for block FP8 on SM120 (NVIDIA 5090 and RTX PRO 6000) (#22131)

Signed-off-by: Junhao Li <junhao@ubicloud.com>
This commit is contained in:
Junhao Li
2025-08-07 22:18:28 -04:00
committed by GitHub
parent b2c8ce57c6
commit 3303f134e0
6 changed files with 229 additions and 18 deletions

View File

@@ -47,4 +47,10 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
// Blockwise-scaled FP8 GEMM for SM120 (per the commit, NVIDIA 5090 / RTX PRO
// 6000): out = (a * a_scales) @ (b * b_scales), writing into `out`.
// Declaration only — mirrors the sm100 variant above; the CUTLASS kernel
// implementation lives in the corresponding .cu file.
// NOTE(review): exact scale granularity (per-block group sizes) is not visible
// here — confirm against the kernel implementation before relying on it.
void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out,
torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
} // namespace vllm