[Kernel] Add GPTQv2 format support for low-bit or asymmetric quantization, by adapting gptq_gemm (#26092)

This commit is contained in:
Xiangyu Li
2025-10-24 11:26:13 +08:00
committed by GitHub
parent 1f9460c4c1
commit 5cc6bddb6e
8 changed files with 295 additions and 98 deletions

View File

@@ -557,7 +557,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// to prevent the meta function registry.
ops.def(
"gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
-"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
+"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool "
+"use_v2_format, int bit) "
"-> Tensor",
{stride_tag});
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);