[Bugfix] Fix prefix strings for quantized VLMs (#9772)

2024-10-29 19:02:59 -04:00
parent 8d7724104a
commit bc73e9821c
20 changed files with 288 additions and 97 deletions
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -55,7 +55,8 @@ from vllm.sequence import IntermediateTensors, PoolerOutput

 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 class LlamaMLP(nn.Module):
@@ -500,6 +501,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        lora_config: Optional[LoRAConfig] = None,
+        prefix: str = "",
    ) -> None:
        super().__init__()

@@ -510,7 +512,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                                cache_config,
                                quant_config,
                                lora_config=lora_config,
-                                prefix="model")
+                                prefix=maybe_prefix(prefix, "model"))
        if get_pp_group().is_last_rank:
            self.unpadded_vocab_size = config.vocab_size
            if lora_config:
@@ -526,6 +528,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                    if not lora_config else
                    lora_config.lora_vocab_padding_size),
                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
            )
            if config.tie_word_embeddings:
                self.lm_head = self.lm_head.tie_weights(