[Model][Quant] Fix GLM, Fix fused module mappings for quantization (#12634)

Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Author: Kyle Sayers
Date: 2025-02-05 00:32:06 -05:00
Committed by: GitHub
Parent: 686006a220
Commit: 7ff7a638b6
12 changed files with 194 additions and 150 deletions

vllm/model_executor/models/qwen.py

@@ -1135,6 +1135,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA):
"""
# Ensure that the LoRA support check passes when the class is not
# initialized, but set all these attributes to empty.
# These will be updated when an instance class is selected
packed_modules_mapping = {}
supported_lora_modules = []
embedding_modules = {}
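
For context, packed_modules_mapping tells the quantization and LoRA machinery which checkpoint sub-modules vLLM fuses into a single module, which is why the mapping must be populated before quantization is configured. A minimal sketch of the mapping's shape; the entries below follow common vLLM conventions (fused QKV and gate/up projections) and are illustrative rather than copied from qwen.py:

# Maps each fused vLLM module to the original sub-module names whose
# weights are concatenated into it (illustrative entries, not qwen.py's).
packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],  # fused attention input proj
    "gate_up_proj": ["gate_proj", "up_proj"],    # fused MLP input proj
}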
@@ -1146,9 +1147,18 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA):
         prefix: str = "",
     ) -> QWenBaseModel:
         config = vllm_config.model_config.hf_config
         # Initialize VL
-        if hasattr(config, "visual"):
-            return QWenVL(vllm_config=vllm_config, prefix=prefix)
+        if hasattr(config, "visual"):  # noqa: SIM108
+            instance_cls = QWenVL
         # Initialize LLM
         else:
-            return QWenLLM(vllm_config=vllm_config, prefix=prefix)
+            instance_cls = QWenLLM
+
+        # quant_config references base class members,
+        # so update values before init is called
+        cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
+        cls.supported_lora_modules += instance_cls.supported_lora_modules
+        cls.embedding_modules.update(instance_cls.embedding_modules)
+        cls.embedding_padding_modules += instance_cls.embedding_padding_modules
+
+        return instance_cls(vllm_config=vllm_config, prefix=prefix)
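
The added comment states the crux of the fix: quantization setup reads these mappings off the class while __init__ runs, so a dispatcher that picks a concrete variant must copy the variant's mappings onto itself before constructing the instance. A minimal, self-contained sketch of that pattern, with hypothetical names standing in for vLLM's machinery:

from typing import ClassVar


def build_quant_mapping(model_cls: type) -> dict[str, list[str]]:
    # Stand-in for quant-config setup: it reads the *registered* model
    # class (the dispatcher), not the concrete variant it returns.
    return dict(model_cls.packed_modules_mapping)


class Base:
    packed_modules_mapping: ClassVar[dict[str, list[str]]] = {}


class LLMVariant(Base):
    packed_modules_mapping = {"gate_up_proj": ["w1", "w2"]}


class Dispatcher(Base):
    # Empty placeholder so support checks pass before a variant is chosen.
    packed_modules_mapping = {}

    @classmethod
    def create(cls) -> Base:
        instance_cls = LLMVariant  # selection logic elided
        # Copy the variant's mapping onto the dispatcher *before* the
        # instance is built, mirroring the fix above; without this,
        # build_quant_mapping(Dispatcher) would still see an empty dict.
        cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
        return instance_cls()


assert build_quant_mapping(Dispatcher) == {}  # empty before selection
Dispatcher.create()
assert build_quant_mapping(Dispatcher) == {"gate_up_proj": ["w1", "w2"]}

As a side note, the # noqa: SIM108 in the hunk silences the lint rule that would otherwise suggest collapsing the if/else into a ternary expression, keeping room for the per-branch comments.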