[Quantization][Autoround][XPU] Add W4A16 Support (#37986)

Signed-off-by: yiliu30 <yi4.liu@intel.com>
This commit is contained in:
Yi Liu
2026-04-01 00:48:24 +08:00
committed by GitHub
parent 3896e021a0
commit 0dd25a44ea
2 changed files with 165 additions and 11 deletions

View File

@@ -42,6 +42,7 @@ docker run \
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
python3 examples/basic/offline_inference/generate.py --model OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc --block-size 64 --enforce-eager --max-model-len 8192
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine

View File

@@ -6,14 +6,24 @@ from typing import TYPE_CHECKING, Any
import regex as re
import torch
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
from vllm.model_executor.layers.linear import (
LinearBase,
LinearMethodBase,
UnquantizedLinearMethod,
)
from vllm.model_executor.layers.quantization import (
QuantizationConfig,
QuantizationMethods,
)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.parameter import (
GroupQuantScaleParameter,
PackedvLLMParameter,
RowvLLMParameter,
)
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
@@ -402,16 +412,30 @@ class INCConfig(QuantizationConfig):
return None
def apply_xpu_w4a16_quant_layer(self, layer, prefix: str):
    """Select the quant method for ``layer`` on XPU (W4A16 GPTQ, sym only).

    Args:
        layer: The module being wired up (e.g. a LinearBase subclass or
            ParallelLMHead).
        prefix: Dotted parameter-name prefix used to look up this layer's
            per-layer quantization config.

    Returns:
        An ``INCXPULinearMethod`` for 4-bit symmetric linear/LM-head layers,
        an ``UnquantizedLinearMethod`` for linear-like layers the config
        leaves unquantized, or ``None`` for anything else.

    Raises:
        NotImplementedError: If the layer is quantized with a bit-width
            other than 4, or with asymmetric quantization.
    """
    weight_bits, group_size, sym = self.get_layer_config(layer, prefix)
    if not self.check_quantized(weight_bits):
        # Layer is not quantized: linear-like layers fall back to the
        # plain unquantized path; other module types get no quant method.
        if isinstance(layer, (LinearBase, ParallelLMHead)):
            return UnquantizedLinearMethod()
        return None
    if weight_bits != 4:
        raise NotImplementedError(
            f"INC on XPU only supports 4-bit quantization, "
            f"got weight_bits={weight_bits}."
        )
    if not sym:
        raise NotImplementedError(
            "INC W4A16 on XPU only supports symmetric quantization for now."
        )
    if isinstance(layer, (LinearBase, ParallelLMHead)):
        return INCXPULinearMethod(
            weight_bits=weight_bits,
            group_size=group_size,
            sym=sym,
        )
    return None
def get_quant_method(self, layer: torch.nn.Module, prefix: str):
if prefix and self.extra_config:
@@ -420,12 +444,8 @@ class INCConfig(QuantizationConfig):
layer_name == prefix or layer_name == f"model.{prefix}"
) and self.extra_config[layer_name].get("bits", 16) >= 16:
return UnquantizedLinearMethod()
if (
current_platform.is_cpu()
or current_platform.is_xpu()
or self.backend == "ipex"
):
return self.apply_ipex_quant_layer(layer, prefix)
if current_platform.is_xpu():
return self.apply_xpu_w4a16_quant_layer(layer, prefix)
if "gptq" in self.packing_format or "gptq" in self.backend:
return self.apply_gptq_quant_layer(layer, prefix)
if "awq" in self.packing_format or "awq" in self.backend:
@@ -440,3 +460,136 @@ class INCConfig(QuantizationConfig):
if is_auto_round_format:
return cls.get_name()
return None
class INCXPULinearMethod(LinearMethodBase):
    """XPU linear method for INC w4a16 GPTQ quantization (symmetric only).

    Repacks GPTQ weights from [in_packed, out] to oneDNN [out, in_packed]
    layout and calls torch.ops._xpu_C.int4_gemm_w4a16.

    GPTQ format: qweight [in_packed, out] with sequential nibble order.

    Note: Asymmetric quantization (sym=false) is not supported for now.

    FIXME(yiliu30): Refine the implementation to reuse XPUwNa16LinearKernel.
    """

    def __init__(self, weight_bits: int, group_size: int, sym: bool) -> None:
        # weight_bits: bits per quantized weight element (callers pass 4).
        self.weight_bits = weight_bits
        # group_size: number of input channels sharing one scale/zero-point.
        self.group_size = group_size
        # sym: symmetric quantization flag; only True is supported here.
        self.sym = sym
        # How many quantized values fit in one packed int32 word
        # (e.g. 32 // 4 = 8 nibbles per int32).
        self.pack_factor = 32 // weight_bits

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Register empty GPTQ-format parameters on ``layer`` for loading.

        Creates ``qweight``, ``scales``, ``qzeros`` and ``g_idx`` with the
        shapes the GPTQ checkpoint stores them in; actual values are filled
        in by the weight loader and repacked later in
        ``process_weights_after_loading``.
        """
        del output_size  # Unused.
        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")
        # Number of quantization groups along the input dimension.
        # NOTE(review): assumes group_size evenly divides
        # input_size_per_partition — confirm upstream validation.
        scales_and_zp_size = input_size_per_partition // self.group_size

        # GPTQ: qweight [in // pack_factor, out] packed along input dim
        qweight = PackedvLLMParameter(
            data=torch.empty(
                input_size_per_partition // self.pack_factor,
                output_size_per_partition,
                dtype=torch.int32,
            ),
            input_dim=0,
            output_dim=1,
            packed_dim=0,
            packed_factor=self.pack_factor,
            weight_loader=weight_loader,
        )
        # scales: [num_groups, out] params_dtype
        scales = GroupQuantScaleParameter(
            data=torch.empty(
                scales_and_zp_size,
                output_size_per_partition,
                dtype=params_dtype,
            ),
            input_dim=0,
            output_dim=1,
            weight_loader=weight_loader,
        )
        # qzeros: [num_groups, out // pack_factor] int32
        qzeros = PackedvLLMParameter(
            data=torch.empty(
                scales_and_zp_size,
                output_size_per_partition // self.pack_factor,
                dtype=torch.int32,
            ),
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=self.pack_factor,
            weight_loader=weight_loader,
        )
        layer.register_parameter("qweight", qweight)
        layer.register_parameter("scales", scales)
        layer.register_parameter("qzeros", qzeros)
        # GPTQ checkpoints may include g_idx for activation reordering.
        # Register it so the weight loader doesn't error on unexpected keys.
        # Initialized to the trivial (non-reordered) group mapping.
        g_idx = RowvLLMParameter(
            data=torch.tensor(
                [i // self.group_size for i in range(input_size_per_partition)],
                dtype=torch.int32,
            ),
            input_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("g_idx", g_idx)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Repack GPTQ weights into kernel-ready NT layout."""
        device = layer.qweight.data.device
        # oneDNN int4 kernel requires strides[0]==1 ("NT format"), but GPTQ
        # checkpoint is [K_packed, N] contiguous with strides (N, 1).
        # Two transposes are needed — neither alone can achieve this:
        #   1. .t().contiguous() → [N, K_packed] contiguous in memory
        #   2. .t() → [K_packed, N] view with strides (1, K_packed)
        # The result has the same logical shape but strides[0]==1 as required.
        qweight_ct = layer.qweight.data.t().contiguous()
        layer.qweight = Parameter(qweight_ct.t(), requires_grad=False)
        # Scales: [num_groups, out] — no change needed
        layer.scales = Parameter(layer.scales.data, requires_grad=False)
        # Symmetric: GPTQ v1 stores qzeros=7, effective zp = 7+1 = 8
        # Kernel expects int8 scalar = 8
        # (the loaded packed qzeros tensor is discarded and replaced).
        layer.qzeros = Parameter(
            torch.tensor([8], dtype=torch.int8, device=device),
            requires_grad=False,
        )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute ``x @ W^T (+ bias)`` via the XPU int4 W4A16 GEMM kernel.

        Flattens leading dims of ``x`` to a 2-D matmul input, runs the
        fused dequant-GEMM, and restores the original leading shape with
        the output feature dimension appended.
        """
        # qweight is already in NT layout [K_packed, N] (strides (1, K_packed))
        # from process_weights_after_loading — pass directly to kernel.
        out_shape = x.shape[:-1] + (layer.qweight.shape[1],)
        reshaped_x = x.reshape(-1, x.shape[-1])
        # NOTE(review): argument order (x, qweight, bias, scales, qzeros,
        # group_size, g_idx) is assumed from this call site — verify against
        # the _xpu_C.int4_gemm_w4a16 op schema.
        out = torch.ops._xpu_C.int4_gemm_w4a16(
            reshaped_x,
            layer.qweight,
            bias,
            layer.scales,
            layer.qzeros,
            self.group_size,
            None,  # g_idx not needed: desc_act is always False for INC models
        )
        return out.reshape(out_shape)