[Perf] Cuda Kernel for Int8 Per Token Group Quant (#21476)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Author: Wentao Ye
Date: 2025-07-25 20:07:07 -04:00
Committed by: GitHub
Parent: 41d3082c41
Commit: 75d29cf4e1
6 changed files with 47 additions and 3 deletions


@@ -0,0 +1,10 @@
#pragma once

#include <torch/all.h>

// TODO(wentao): refactor the folder to 8bit, then include the fp8 and int8
// folders
// 8-bit per-token-group quantization helper used by both the FP8 and INT8
// paths. Quantizes `input` in contiguous groups of `group_size` elements,
// writing quantized values to `output_q` and one scale per group to
// `output_s`. `eps` keeps scales away from zero; `min_8bit`/`max_8bit` bound
// the quantized range.
void per_token_group_quant_8bit(const torch::Tensor& input,
                                torch::Tensor& output_q,
                                torch::Tensor& output_s, int64_t group_size,
                                double eps, double min_8bit, double max_8bit,
                                bool scale_ue8m0 = false);
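
The commit itself provides a CUDA kernel; as a minimal sketch of the reference semantics, assuming the standard symmetric per-group scheme (scale = max|x| / max_8bit, clamped below by eps; q = clamp(round(x / scale), min_8bit, max_8bit)), a plain CPU version for the INT8 case might look like the following. The function name and `std::vector` signature here are hypothetical, not part of the commit:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical CPU reference for int8 per-token-group quantization
// (a sketch of the assumed semantics, not the kernel from this commit).
// For each contiguous group of `group_size` elements:
//   scale = max(|x|, eps) / max_8bit
//   q     = clamp(round(x / scale), min_8bit, max_8bit)
void per_token_group_quant_int8_ref(const std::vector<float>& input,
                                    std::vector<int8_t>& output_q,
                                    std::vector<float>& output_s,
                                    int64_t group_size, float eps,
                                    float min_8bit, float max_8bit) {
  const int64_t num_groups =
      static_cast<int64_t>(input.size()) / group_size;
  output_q.resize(input.size());
  output_s.resize(num_groups);
  for (int64_t g = 0; g < num_groups; ++g) {
    // Per-group absolute maximum, clamped by eps to avoid a zero scale.
    float max_abs = 0.0f;
    for (int64_t i = 0; i < group_size; ++i)
      max_abs = std::max(max_abs, std::fabs(input[g * group_size + i]));
    const float scale = std::max(max_abs, eps) / max_8bit;
    output_s[g] = scale;
    // Quantize each element of the group with round-to-nearest.
    for (int64_t i = 0; i < group_size; ++i) {
      const float q = std::round(input[g * group_size + i] / scale);
      output_q[g * group_size + i] =
          static_cast<int8_t>(std::clamp(q, min_8bit, max_8bit));
    }
  }
}
```

The CUDA kernel would typically assign one thread block (or warp) per group and reduce the absolute maximum in shared memory before the quantization pass, but the per-element arithmetic matches this sketch.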