[Quantization][Refactor] Move CPU GPTQ kernel into MP linear (#31801)
Signed-off-by: jiang1.li <jiang1.li@intel.com> Signed-off-by: Li, Jiang <bigpyj64@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -42,6 +42,9 @@ def query_marlin_supported_quant_types(
|
||||
include_fp_type: bool = True,
|
||||
device_capability: int | None = None,
|
||||
):
|
||||
if current_platform.is_cpu():
|
||||
return _query_cpu_marlin_supported_quant_types(has_zp, include_fp_type)
|
||||
|
||||
if device_capability is None:
|
||||
capability_tuple = current_platform.get_device_capability()
|
||||
device_capability = (
|
||||
@@ -74,6 +77,33 @@ def query_marlin_supported_quant_types(
|
||||
return res
|
||||
|
||||
|
||||
def _query_cpu_marlin_supported_quant_types(
    has_zp: bool | None = None,
    include_fp_type: bool = True,
):
    """Return the quant types the CPU Marlin-compatible path supports.

    Args:
        has_zp: Selects which family of quant types to report.
            - ``True``: only types that use zero points.
            - ``False``: only types that do not use zero points.
            - ``None``: both, with the no-zero-point types listed first.
        include_fp_type: Accepted for signature parity with the caller and
            forwarded on recursion; it does not affect the result here.

    Returns:
        A newly built list of scalar types.
    """
    if has_zp is not None:
        if has_zp:
            # AWQ style: unsigned weights with a runtime zero point.
            return [scalar_types.uint4]
        # GPTQ style: unsigned weights with a symmetric bias; only 4-bit
        # is supported for now.
        return [scalar_types.uint4b8]

    # No preference given: concatenate the no-zero-point types followed by
    # the zero-point types.
    return [
        *_query_cpu_marlin_supported_quant_types(False, include_fp_type),
        *_query_cpu_marlin_supported_quant_types(True, include_fp_type),
    ]
|
||||
|
||||
|
||||
def _check_marlin_supported(
|
||||
quant_type: ScalarType,
|
||||
group_size: int | None,
|
||||
|
||||
Reference in New Issue
Block a user