diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 120addba2..d43557a29 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -498,7 +498,9 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ - Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py) - Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py) -### Llama Nemotron Multimodal Embedding Models +### Llama Nemotron Multimodal + +#### Embedding Model Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone (from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce @@ -559,6 +561,70 @@ curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" }' ``` +#### Reranker Model + +Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP +backbone with a sequence-classification head for cross-encoder scoring and reranking. + +| Architecture | Backbone | Example HF Models | +|---|---|---| +| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` | + +Start the server: + +```shell +vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \ + --runner pooling \ + --trust-remote-code \ + --chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja +``` + +!!! note + The chat template bundled with this checkpoint's tokenizer is not suitable + for the Score/Rerank APIs. Use the provided override template when serving: + `examples/pooling/score/template/nemotron-vl-rerank.jinja`. 
+ +Score a text query against an image document: + +```shell +curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ + "model": "nvidia/llama-nemotron-rerank-vl-1b-v2", + "data_1": "Find diagrams about autonomous robots", + "data_2": [ + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Robotics workflow diagram."} + ] + } + ] +}' +``` + +Rerank image documents by a text query: + +```shell +curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ + "model": "nvidia/llama-nemotron-rerank-vl-1b-v2", + "query": "Find diagrams about autonomous robots", + "documents": [ + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Robotics workflow diagram."} + ] + }, + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "General skyline photo."} + ] + } + ], + "top_n": 2 +}' +``` + ### BAAI/bge-m3 The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json` diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index eca66041d..534411c63 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -842,6 +842,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|--------|-------------------|----------------------|---------------------------| | `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. 
| ✅︎ | ✅︎ |
+| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + IE+ | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | |
 | `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + IE+ + VE+ | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ |

C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
diff --git a/examples/pooling/score/template/nemotron-vl-rerank.jinja b/examples/pooling/score/template/nemotron-vl-rerank.jinja
new file mode 100644
index 000000000..25b9887b8
--- /dev/null
+++ b/examples/pooling/score/template/nemotron-vl-rerank.jinja
@@ -0,0 +1,15 @@
+{%- set query_msg = (messages | selectattr('role', 'equalto', 'query') | list | first) -%}
+{%- set doc_msg = (messages | selectattr('role', 'equalto', 'document') | list | first) -%}
+
+{%- set q = query_msg['content'] -%}
+{%- set d = doc_msg['content'] -%}
+
+{# If the doc contains <image> anywhere, hoist a single <image> to the front #}
+{%- set has_image = ("<image>" in d) -%}
+{%- set d_clean = d | replace("<image>", "") -%}
+{%- set q_clean = q | replace("<image>", "") -%}
+
+{%- if has_image -%}{{ "<image> " }}{%- endif -%}
+question:{{ q_clean }}{{ " " }}
+{{ " " }}
+{{ " " }}passage:{{ d_clean }}
\ No newline at end of file
diff --git a/tests/models/multimodal/pooling/test_llama_nemotron_vl.py b/tests/models/multimodal/pooling/test_llama_nemotron_vl.py
new file mode 100644
index 000000000..84cae19ee
--- /dev/null
+++ b/tests/models/multimodal/pooling/test_llama_nemotron_vl.py
@@ -0,0 +1,355 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for the LlamaNemotronVL model family:
+    - nvidia/llama-nemotron-embed-vl-1b-v2 (LlamaNemotronVLForCausalLM / embed)
+    - nvidia/llama-nemotron-rerank-vl-1b-v2
+      (LlamaNemotronVLForSequenceClassification / rerank)
+
+Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
+""" + +import base64 +from io import BytesIO +from pathlib import Path + +import pytest +import torch +from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor + +from vllm.entrypoints.chat_utils import ( + ChatCompletionContentPartImageParam, + ChatCompletionContentPartTextParam, +) +from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam + +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ...utils import check_embeddings_close + +# Prefixes used by the model API +QUERY_PREFIX = "query: " +PASSAGE_PREFIX = "passage: " + +# Text prompts for text-only embedding +HF_TEXT_PROMPTS = [ + # T -> X (text embedding queries) + f"{QUERY_PREFIX}The label of the object is stop sign", + f"{QUERY_PREFIX}cherry blossom", +] + +# Image prompts using the model's expected format +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts( + { + # I -> X (image embedding as passage/document) + "stop_sign": f"{PASSAGE_PREFIX}", + "cherry_blossom": f"{PASSAGE_PREFIX}", + } +) + +MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"] + + +def _run_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + input_texts: list[str], + input_images: PromptImageInput, + model: str, + *, + dtype: str, +) -> None: + """Run embedding comparison test between HF and vLLM. + + NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing. 
+ """ + # Run vLLM inference first + with vllm_runner( + model, + runner="pooling", + dtype=dtype, + max_model_len=2048, + enforce_eager=True, + trust_remote_code=True, + ) as vllm_model: + vllm_outputs = vllm_model.embed(input_texts, images=input_images) + + # Run HF inference using the model's encode_queries/encode_documents API + with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model: + hf_outputs = [] + for text, image in zip(input_texts, input_images): + with torch.inference_mode(): + if text.startswith(QUERY_PREFIX): + # Strip prefix and use encode_queries for query texts + query_text = text[len(QUERY_PREFIX) :] + embedding = hf_model.model.encode_queries([query_text]) + elif text.startswith(PASSAGE_PREFIX): + # Strip prefix and use encode_documents for passages/images + passage_text = text[len(PASSAGE_PREFIX) :] + if image is not None: + # Image document - pass image to encode_documents + embedding = hf_model.model.encode_documents( + images=[image], + texts=[passage_text], + ) + else: + # Text-only document + embedding = hf_model.model.encode_documents( + texts=[passage_text] + ) + else: + raise ValueError( + f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'" + ) + + hf_outputs.append(embedding[0].tolist()) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + """Test text-only embedding.""" + input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + model, + dtype=dtype, + ) + + +@pytest.mark.parametrize("model", MODELS) 
+@pytest.mark.parametrize("dtype", ["half"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + """Test image embedding.""" + input_texts_images = [ + (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + model, + dtype=dtype, + ) + + +# --------------------------------------------------------------------------- +# Reranker tests — nvidia/llama-nemotron-rerank-vl-1b-v2 +# --------------------------------------------------------------------------- + +RERANKER_MODELS = ["nvidia/llama-nemotron-rerank-vl-1b-v2"] + +# The tokenizer's built-in chat template is not suitable for the Score/Rerank +# APIs (it's inherited from the base LLM). We must use the provided override. +_RERANKER_SCORE_TEMPLATE = ( + Path(__file__).parents[4] + / "examples/pooling/score/template/nemotron-vl-rerank.jinja" +).read_text() + +RERANKER_TEXT_QUERY = "How is AI improving the intelligence and capabilities of robots?" +RERANKER_TEXT_DOCS = [ + "AI enables robots to perceive, plan, and act autonomously.", + ( + "A biological foundation model designed to analyze DNA, RNA, " + "and protein sequences." 
+ ), +] + +RERANKER_IMAGE_QUERY = "photo of a red stop sign on a street" + + +def _pil_to_data_uri(image) -> str: + buf = BytesIO() + image.save(buf, format="PNG") + b64 = base64.b64encode(buf.getvalue()).decode() + return f"data:image/png;base64,{b64}" + + +def _run_hf_reranker( + hf_runner: type[HfRunner], + model: str, + dtype: str, + query: str, + docs: list, +) -> list[float]: + """Run HF reranker inference; docs is a list of (doc_text, doc_image|None).""" + with hf_runner( + model, + dtype=dtype, + trust_remote_code=True, + auto_cls=AutoModelForSequenceClassification, + ) as hf_model: + processor = AutoProcessor.from_pretrained( + model, + trust_remote_code=True, + max_input_tiles=6, + use_thumbnail=True, + rerank_max_length=2048, + ) + examples = [ + { + "question": query, + "doc_text": doc_text if doc_text is not None else "", + "doc_image": doc_image if doc_image is not None else "", + } + for doc_text, doc_image in docs + ] + batch_dict = processor.process_queries_documents_crossencoder(examples) + batch_dict = { + k: v.to(hf_model.model.device) if isinstance(v, torch.Tensor) else v + for k, v in batch_dict.items() + } + with torch.inference_mode(): + logits = hf_model.model(**batch_dict, return_dict=True).logits + # vLLM applies sigmoid activation to the raw logits before returning + # scores; apply the same here so both sides are comparable. + scores = torch.sigmoid(logits.squeeze(-1).float()) + return scores.detach().cpu().tolist() + + +def _run_vllm_reranker( + vllm_runner: type[VllmRunner], + model: str, + dtype: str, + query: str, + docs: list, +) -> list[float]: + """Run vLLM reranker inference; docs is a list of (doc_text, doc_image|None).""" + with vllm_runner( + model, + runner="pooling", + dtype=dtype, + max_model_len=2048, + enforce_eager=True, + trust_remote_code=True, + ) as vllm_model: + has_images = any(img is not None for _, img in docs) + + if not has_images: + # Text-only path: use the simple string score API. 
+ queries = [query] * len(docs) + doc_texts = [doc_text for doc_text, _ in docs] + outputs = vllm_model.score( + queries, + doc_texts, + chat_template=_RERANKER_SCORE_TEMPLATE, + ) + else: + # Multimodal path: build ScoreMultiModalParam for each pair. + query_params = [ + ScoreMultiModalParam( + content=[ + ChatCompletionContentPartTextParam( + type="text", + text=query, + ) + ] + ) + ] * len(docs) + + doc_params = [] + for doc_text, doc_image in docs: + content: list = [] + if doc_image is not None: + content.append( + ChatCompletionContentPartImageParam( + type="image_url", + image_url={"url": _pil_to_data_uri(doc_image)}, + ) + ) + if doc_text: + content.append( + ChatCompletionContentPartTextParam( + type="text", + text=doc_text, + ) + ) + doc_params.append(ScoreMultiModalParam(content=content)) + + raw_outputs = vllm_model.llm.score( + query_params, + doc_params, + chat_template=_RERANKER_SCORE_TEMPLATE, + ) + outputs = [o.outputs.score for o in raw_outputs] + + return outputs + + +def _run_reranker_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + model: str, + dtype: str, + query: str, + docs: list, +) -> None: + """Compare HF and vLLM reranker scores. + + NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing. 
+ """ + vllm_scores = _run_vllm_reranker(vllm_runner, model, dtype, query, docs) + hf_scores = _run_hf_reranker(hf_runner, model, dtype, query, docs) + + assert len(hf_scores) == len(vllm_scores), ( + f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}" + ) + for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)): + assert hf_score == pytest.approx(vllm_score, rel=0.02), ( + f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}" + ) + + +@pytest.mark.parametrize("model", RERANKER_MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_reranker_text( + hf_runner, + vllm_runner, + model: str, + dtype: str, +) -> None: + """Test reranking with text-only query and text documents.""" + docs = [(text, None) for text in RERANKER_TEXT_DOCS] + _run_reranker_test(hf_runner, vllm_runner, model, dtype, RERANKER_TEXT_QUERY, docs) + + +@pytest.mark.parametrize("model", RERANKER_MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_reranker_image_doc( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + """Test reranking with text query against image documents.""" + docs = [(None, asset.pil_image) for asset in image_assets] + _run_reranker_test(hf_runner, vllm_runner, model, dtype, RERANKER_IMAGE_QUERY, docs) diff --git a/tests/models/multimodal/pooling/test_llama_nemotron_vl_embed.py b/tests/models/multimodal/pooling/test_llama_nemotron_vl_embed.py deleted file mode 100644 index b02d77b9b..000000000 --- a/tests/models/multimodal/pooling/test_llama_nemotron_vl_embed.py +++ /dev/null @@ -1,148 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Tests for LlamaNemotronVL embedding model (nvidia/llama-nemotron-embed-vl-1b-v2). - -This model uses SigLIP vision encoder with bidirectional LLaMA for embeddings. 
-""" - -import pytest -import torch -from transformers import AutoModel - -from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner -from ...utils import check_embeddings_close - -# Prefixes used by the model API -QUERY_PREFIX = "query: " -PASSAGE_PREFIX = "passage: " - -# Text prompts for text-only embedding -HF_TEXT_PROMPTS = [ - # T -> X (text embedding queries) - f"{QUERY_PREFIX}The label of the object is stop sign", - f"{QUERY_PREFIX}cherry blossom", -] - -# Image prompts using the model's expected format -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts( - { - # I -> X (image embedding as passage/document) - "stop_sign": f"{PASSAGE_PREFIX}", - "cherry_blossom": f"{PASSAGE_PREFIX}", - } -) - -MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"] - - -def _run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - input_texts: list[str], - input_images: PromptImageInput, - model: str, - *, - dtype: str, -) -> None: - """Run embedding comparison test between HF and vLLM. - - NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing. 
- """ - # Run vLLM inference first - with vllm_runner( - model, - runner="pooling", - dtype=dtype, - max_model_len=2048, - enforce_eager=True, - trust_remote_code=True, - ) as vllm_model: - vllm_outputs = vllm_model.embed(input_texts, images=input_images) - - # Run HF inference using the model's encode_queries/encode_documents API - with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model: - hf_outputs = [] - for text, image in zip(input_texts, input_images): - with torch.inference_mode(): - if text.startswith(QUERY_PREFIX): - # Strip prefix and use encode_queries for query texts - query_text = text[len(QUERY_PREFIX) :] - embedding = hf_model.model.encode_queries([query_text]) - elif text.startswith(PASSAGE_PREFIX): - # Strip prefix and use encode_documents for passages/images - passage_text = text[len(PASSAGE_PREFIX) :] - if image is not None: - # Image document - pass image to encode_documents - embedding = hf_model.model.encode_documents( - images=[image], - texts=[passage_text], - ) - else: - # Text-only document - embedding = hf_model.model.encode_documents( - texts=[passage_text] - ) - else: - raise ValueError( - f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'" - ) - - hf_outputs.append(embedding[0].tolist()) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_models_text( - hf_runner, - vllm_runner, - image_assets, - model: str, - dtype: str, -) -> None: - """Test text-only embedding.""" - input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS] - input_texts = [text for text, _ in input_texts_images] - input_images = [image for _, image in input_texts_images] - - _run_test( - hf_runner, - vllm_runner, - input_texts, - input_images, # type: ignore - model, - dtype=dtype, - ) - - -@pytest.mark.parametrize("model", MODELS) 
-@pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_models_image( - hf_runner, - vllm_runner, - image_assets, - model: str, - dtype: str, -) -> None: - """Test image embedding.""" - input_texts_images = [ - (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) - ] - input_texts = [text for text, _ in input_texts_images] - input_images = [image for _, image in input_texts_images] - - _run_test( - hf_runner, - vllm_runner, - input_texts, - input_images, - model, - dtype=dtype, - ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 30b400e0e..08f1a14d7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -653,6 +653,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { "LlamaBidirectionalForSequenceClassification": _HfExamplesInfo( "nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True ), + "LlamaNemotronVLForSequenceClassification": _HfExamplesInfo( + "nvidia/llama-nemotron-rerank-vl-1b-v2", trust_remote_code=True + ), "ModernBertForSequenceClassification": _HfExamplesInfo( "Alibaba-NLP/gte-reranker-modernbert-base" ), diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 7de377ab7..ef241d545 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -664,6 +664,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig, "LlamaBidirectionalModel": LlamaBidirectionalConfig, "LlamaNemotronVLModel": LlamaNemotronVLConfig, + "LlamaNemotronVLForSequenceClassification": LlamaNemotronVLConfig, "NomicBertModel": NomicBertModelConfig, "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, "Qwen2ForRewardModel": Qwen2ForRewardModelConfig, diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index bef083c50..b033437d6 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ 
b/vllm/model_executor/models/nemotron_vl.py @@ -7,6 +7,7 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +import math from abc import ABC from collections.abc import Iterable @@ -18,6 +19,7 @@ from transformers import AutoModel, PretrainedConfig from transformers.image_processing_utils_fast import BaseImageProcessorFast from vllm.config import VllmConfig +from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.pooler import DispatchPooler from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig @@ -42,6 +44,7 @@ from vllm.transformers_utils.repo_utils import get_hf_file_to_dict from .interfaces import ( MultiModalEmbeddings, + SupportsCrossEncoding, SupportsLoRA, SupportsMultiModal, SupportsPP, @@ -883,3 +886,57 @@ class LlamaNemotronVLForEmbedding(LlamaNemotronVLChatModel, VllmModelForPooling) """Override to use different weight mapping for SigLIP.""" loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.weight_mapper) + + +class LlamaNemotronVLForSequenceClassification( + LlamaNemotronVLForEmbedding, SupportsCrossEncoding +): + """LlamaNemotronVL model variant for sequence classification / reranking.""" + + # Reranker checkpoint places base model weights under `model.*`, + # while `score.*` remains at the top level. 
+ weight_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) | ( + LlamaNemotronVLForEmbedding.weight_mapper + ) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix) + + text_config = vllm_config.model_config.hf_config.get_text_config() + model_config = vllm_config.model_config + quant_config = vllm_config.quant_config + + self.score = ReplicatedLinear( + model_config.get_hidden_size(), + text_config.num_labels, + bias=False, + params_dtype=model_config.head_dtype, + quant_config=quant_config, + return_bias=False, + prefix=maybe_prefix(prefix, "score"), + ) + + pooler_config = model_config.pooler_config + assert pooler_config is not None + self.pooler = DispatchPooler.for_seq_cls(pooler_config, classifier=self.score) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loaded_weights = super().load_weights(weights) + + # reranker checkpoint omits the inner LM seq-cls head + # (`language_model.score.*`). It is unused by this outer model, but + # the default loader expects all parameters to be initialized. 
+ for name, param in self.named_parameters(): + if not name.startswith("language_model.score.") or name in loaded_weights: + continue + + if name.endswith(".weight"): + torch.nn.init.kaiming_uniform_(param, a=math.sqrt(5)) + elif name.endswith(".bias"): + torch.nn.init.zeros_(param) + else: + torch.nn.init.normal_(param, mean=0.0, std=0.02) + + loaded_weights.add(name) + + return loaded_weights diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 97937e886..7f6b7e300 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -284,6 +284,10 @@ _CROSS_ENCODER_MODELS = { "llama", "LlamaBidirectionalForSequenceClassification", ), + "LlamaNemotronVLForSequenceClassification": ( + "nemotron_vl", + "LlamaNemotronVLForSequenceClassification", + ), "ModernBertForSequenceClassification": ( "modernbert", "ModernBertForSequenceClassification",