[Quantization][Refactor] Move CPU GPTQ kernel into MP linear (#31801)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Li, Jiang <bigpyj64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Li, Jiang
2026-01-07 03:10:18 +08:00
committed by GitHub
parent c07163663d
commit 8becf146bd
9 changed files with 171 additions and 332 deletions

View File

@@ -42,6 +42,9 @@ def query_marlin_supported_quant_types(
include_fp_type: bool = True,
device_capability: int | None = None,
):
if current_platform.is_cpu():
return _query_cpu_marlin_supported_quant_types(has_zp, include_fp_type)
if device_capability is None:
capability_tuple = current_platform.get_device_capability()
device_capability = (
@@ -74,6 +77,33 @@ def query_marlin_supported_quant_types(
return res
def _query_cpu_marlin_supported_quant_types(
    has_zp: bool | None = None,
    include_fp_type: bool = True,
):
    """Return the list of quant types available on the CPU path.

    Args:
        has_zp: Selects which family of quant types is returned.
            ``True`` yields the types that carry zero points, ``False``
            yields the types without zero points, and ``None`` yields
            both (types without zero points first).
        include_fp_type: Forwarded unchanged to the recursive calls; it
            does not otherwise influence the result in this function.

    Returns:
        A newly built list of scalar types.
    """
    if has_zp is None:
        # No preference given: query each family separately and join
        # the results, keeping the "no zero point" types in front.
        without_zp, with_zp = (
            _query_cpu_marlin_supported_quant_types(flag, include_fp_type)
            for flag in (False, True)
        )
        return without_zp + with_zp

    # Truthy  -> AWQ style: unsigned + runtime zero-point.
    # Falsy   -> GPTQ style: unsigned + symmetric bias; only 4-bit is
    #            supported for now.
    return [scalar_types.uint4] if has_zp else [scalar_types.uint4b8]
def _check_marlin_supported(
quant_type: ScalarType,
group_size: int | None,