[Hardware][CPU] Support AWQ for CPU backend (#7515)

This commit is contained in:
Li, Jiang
2024-10-10 00:28:08 +08:00
committed by GitHub
parent 7dea289066
commit ca77dd7a44
9 changed files with 214 additions and 7 deletions

View File

@@ -20,6 +20,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.parameter import (GroupQuantScaleParameter,
PackedvLLMParameter)
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
logger = init_logger(__name__)
@@ -123,6 +124,9 @@ class AWQMarlinConfig(QuantizationConfig):
group_size = quant_config.get("group_size")
has_zp = quant_config.get("zero_point")
if not current_platform.is_cuda():
return False
if quant_method != "awq":
return False