[Kernel] Add GPTQv2 format support for low-bit or asymmetric quantization, by adapting gptq_gemm (#26092)

Xiangyu Li
2025-10-24 11:26:13 +08:00
committed by GitHub
parent 1f9460c4c1
commit 5cc6bddb6e
8 changed files with 295 additions and 98 deletions


@@ -145,10 +145,15 @@ class ExllamaLinearKernel(MPLinearKernel):
         w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)
+        # gptq_gemm supports the GPTQv2 format by passing use_v2_format=True.
+        # However, MPLinearLayerConfig doesn't carry format info, so the
+        # GPTQv1 format is hardcoded here to keep existing behavior unchanged.
+        use_v2_format = False
         assert w_zp is not None, "Zero points are required by Exllama"
         assert w_g_idx is not None, "Group index is required by Exllama"
         output = ops.gptq_gemm(
-            x_2d, w_q, w_zp, w_s, w_g_idx, True, c.weight_type.size_bits
+            x_2d, w_q, w_zp, w_s, w_g_idx, True, use_v2_format, c.weight_type.size_bits
         )
         if bias is not None:
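
For downstream callers, the updated op inserts the new use_v2_format flag between the Exllama switch and the bit width. Below is a minimal sketch of a GPTQv2 call site; the wrapper function is hypothetical, and it assumes already-packed GPTQ tensors and vLLM's _custom_ops bindings:

import torch

from vllm import _custom_ops as ops

def gptq_gemm_v2(
    x_2d: torch.Tensor,      # activations, flattened to 2D
    w_q: torch.Tensor,       # packed quantized weights
    w_zp: torch.Tensor,      # packed zero points (required by Exllama)
    w_s: torch.Tensor,       # per-group scales
    w_g_idx: torch.Tensor,   # group indices (required by Exllama)
    size_bits: int,          # weight bit width, e.g. 4
) -> torch.Tensor:
    # Hypothetical wrapper: the same call as in ExllamaLinearKernel above,
    # but with use_v2_format=True so the kernel interprets the checkpoint's
    # zero points per the GPTQv2 convention instead of GPTQv1.
    return ops.gptq_gemm(
        x_2d, w_q, w_zp, w_s, w_g_idx,
        True,   # use_exllama
        True,   # use_v2_format
        size_bits,
    )

As the committed comment notes, wiring this up for real checkpoints would also require threading the format information through MPLinearLayerConfig, which this change deliberately leaves untouched.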