[Kernel] Dynamic Per-Token Activation Quantization (#5037)

Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-07 12:36:26 -04:00
parent dc49fb892c
commit ca3ea51bde
12 changed files with 439 additions and 75 deletions
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -70,6 +70,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
          "Compute int8 quantized tensor for given scaling factor");

+  ops.def("dynamic_scaled_int8_quant", &dynamic_scaled_int8_quant,
+          "Compute int8 quantized tensor and scaling factor");
+
  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def("swap_blocks", &swap_blocks,