diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 8c10c6ddc..467e8ab67 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -288,15 +288,37 @@ def as_seq_cls_model(cls: _T) -> _T: vllm_config: "VllmConfig", prefix: str = "", ) -> "Pooler": - text_config = vllm_config.model_config.hf_config.get_text_config() + hf_config = vllm_config.model_config.hf_config + text_config = hf_config.get_text_config() model_config = vllm_config.model_config - quant_config = vllm_config.quant_config + + # Check if score weights are derived online from LM head + # (same condition as load_weights branch) + tokens = getattr( + hf_config, + "classifier_from_token", + getattr(text_config, "classifier_from_token", None), + ) + method = getattr( + hf_config, + "method", + getattr(text_config, "method", None), + ) + + # Online conversion: no score weights in checkpoint, don't + # quantize (small output_dim breaks FP8/Marlin tile alignment). + # Checkpoint-based: respect the model's quant_config. + quant_config = ( + None + if (tokens is not None or method is not None) + else vllm_config.quant_config + ) self.score = ReplicatedLinear( model_config.get_hidden_size(), text_config.num_labels, bias=False, - params_dtype=vllm_config.model_config.head_dtype, + params_dtype=model_config.head_dtype, quant_config=quant_config, return_bias=False, prefix=maybe_prefix(prefix, "score"), @@ -452,7 +474,6 @@ def load_weights_using_from_2_way_softmax( from vllm.model_executor.model_loader.weight_utils import default_weight_loader model_config = model.vllm_config.model_config - quant_config = model.vllm_config.quant_config hf_config = model.config text_config = hf_config.get_text_config() @@ -469,7 +490,8 @@ def load_weights_using_from_2_way_softmax( using_vlm_head = is_vlm and hasattr(language_model, "score") language_model.lm_head = ParallelLMHead( - text_config.vocab_size, text_config.hidden_size, quant_config=quant_config + text_config.vocab_size, + text_config.hidden_size, ) if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not @@ -531,7 +553,6 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from vllm.model_executor.model_loader.weight_utils import default_weight_loader model_config = model.vllm_config.model_config - quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() tokens = getattr(text_config, "classifier_from_token", []) @@ -543,7 +564,8 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te using_vlm_head = is_vlm and hasattr(language_model, "score") language_model.lm_head = ParallelLMHead( - text_config.vocab_size, text_config.hidden_size, quant_config=quant_config + text_config.vocab_size, + text_config.hidden_size, ) if text_config.tie_word_embeddings: # embed_tokens is the assumed name for input embeddings. If the model does not