[Misc][Quark] Upstream Quark format to VLLM (#10765)

Signed-off-by: kewang-xlnx <kewang@xilinx.com>
Signed-off-by: kewang2 <kewang2@amd.com>
Co-authored-by: kewang2 <kewang2@amd.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
kewang-xlnx
2025-01-16 00:05:15 +08:00
committed by GitHub
parent 5ecf3e0aaf
commit de0526f668
32 changed files with 1264 additions and 70 deletions

vllm/model_executor/models/granite.py

@@ -39,8 +39,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    get_compressed_tensors_cache_scale)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
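
Note: the removed compressed-tensors-only helper is superseded by a `get_cache_scale` hook on the quantization config itself, so each backend (Quark, compressed-tensors) can supply its own name mapping. A minimal sketch of that pattern, not part of this diff; the name convention shown follows compressed-tensors checkpoints and is illustrative:

from typing import Optional

class QuantizationConfig:
    def get_cache_scale(self, name: str) -> Optional[str]:
        # Base behavior: no kv-cache scales to remap.
        return None

class CompressedTensorsConfig(QuantizationConfig):
    def get_cache_scale(self, name: str) -> Optional[str]:
        # Map a checkpoint scale name to the attention module's
        # kv-cache scale parameter, or return None if not a scale.
        if name.endswith(".output_scale") and ".k_proj" in name:
            return name.replace(".k_proj.output_scale", ".attn.k_scale")
        if name.endswith(".output_scale") and ".v_proj" in name:
            return name.replace(".v_proj.output_scale", ".attn.v_scale")
        return None

cfg = CompressedTensorsConfig()
assert cfg.get_cache_scale(
    "model.layers.0.self_attn.k_proj.output_scale"
) == "model.layers.0.self_attn.attn.k_scale"
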
@@ -371,6 +369,7 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self.config = config
         self.lora_config = lora_config
+        self.quant_config = quant_config
         self.model = GraniteModel(vllm_config=vllm_config,
                                   prefix=maybe_prefix(prefix, "model"))
@@ -474,12 +473,15 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             # processed with quantization, LoRA, fine-tuning, etc.
             if self.config.tie_word_embeddings and "lm_head.weight" in name:
                 continue
-            if scale_name := get_compressed_tensors_cache_scale(name):
-                # Loading kv cache scales for compressed-tensors quantization
+            if (self.quant_config is not None and
+                    (scale_name := self.quant_config.get_cache_scale(name))):
+                # Loading kv cache scales for quark and
+                # compressed-tensors quantization
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
-                loaded_weight = loaded_weight[0]
+                loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else
+                                 loaded_weight[0])
                 weight_loader(param, loaded_weight)
                 loaded_params.add(scale_name)
                 continue
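
Note: the new conditional expression above accepts both scale layouts. A small illustrative snippet of the behavior (`normalize_scale` is a hypothetical helper for demonstration, not vLLM API), assuming Quark stores the kv-cache scale as a 0-dim tensor while compressed-tensors checkpoints store a 1-element tensor:

import torch

def normalize_scale(loaded_weight: torch.Tensor) -> torch.Tensor:
    # Collapse either layout to a scalar before handing it to the
    # parameter's weight_loader.
    return loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]

assert normalize_scale(torch.tensor(0.5)).dim() == 0       # Quark-style 0-dim
assert normalize_scale(torch.tensor([0.5])).item() == 0.5  # compressed-tensors 1-element
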