[Kernel] Dynamic Per-Token Activation Quantization (#5037)
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com> Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
@@ -70,6 +70,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
||||
ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
|
||||
"Compute int8 quantized tensor for given scaling factor");
|
||||
|
||||
ops.def("dynamic_scaled_int8_quant", &dynamic_scaled_int8_quant,
|
||||
"Compute int8 quantized tensor and scaling factor");
|
||||
|
||||
// Cache ops
|
||||
pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
|
||||
cache_ops.def("swap_blocks", &swap_blocks,
|
||||
|
||||
Reference in New Issue
Block a user