[Quantization][Refactor] Move CPU GPTQ kernel into MP linear (#31801)

Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Li, Jiang <bigpyj64@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-07 03:10:18 +08:00
parent c07163663d
commit 8becf146bd
9 changed files with 171 additions and 332 deletions
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -42,6 +42,9 @@ def query_marlin_supported_quant_types(
    include_fp_type: bool = True,
    device_capability: int | None = None,
 ):
+    if current_platform.is_cpu():
+        return _query_cpu_marlin_supported_quant_types(has_zp, include_fp_type)
+
    if device_capability is None:
        capability_tuple = current_platform.get_device_capability()
        device_capability = (
@@ -74,6 +77,33 @@ def query_marlin_supported_quant_types(
        return res


+def _query_cpu_marlin_supported_quant_types(
+    has_zp: bool | None = None,
+    include_fp_type: bool = True,  # only forwarded on recursion; unused below
+):
+    # - has_zp is True: return only the quant types that use zero points
+    # - has_zp is False: return only the quant types without zero points
+    # - has_zp is None: return both, non-zero-point types listed first
+    if has_zp is None:
+        types0 = _query_cpu_marlin_supported_quant_types(
+            False,
+            include_fp_type,
+        )
+        types1 = _query_cpu_marlin_supported_quant_types(
+            True,
+            include_fp_type,
+        )
+        return types0 + types1
+
+    if has_zp:
+        # AWQ style, unsigned + runtime zero-point
+        return [scalar_types.uint4]
+    else:
+        # GPTQ style, unsigned + symmetric bias; only 4-bit is supported for now
+        res = [scalar_types.uint4b8]
+        return res
+
+
 def _check_marlin_supported(
    quant_type: ScalarType,
    group_size: int | None,