[Model] Add support for nvidia/llama-nemotron-rerank-vl-1b-v2 (#35735)
Signed-off-by: Jakub Zakrzewski <jzakrzewski@nvidia.com>
This commit is contained in:
@@ -498,7 +498,9 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
|
||||
- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
|
||||
|
||||
### Llama Nemotron Multimodal Embedding Models
|
||||
### Llama Nemotron Multimodal
|
||||
|
||||
#### Embedding Model
|
||||
|
||||
Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone
|
||||
(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce
|
||||
@@ -559,6 +561,70 @@ curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json"
|
||||
}'
|
||||
```
|
||||
|
||||
#### Reranker Model
|
||||
|
||||
Llama Nemotron VL reranker models combine the same bidirectional Llama + SigLIP
|
||||
backbone with a sequence-classification head for cross-encoder scoring and reranking.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
|---|---|---|
|
||||
| `LlamaNemotronVLForSequenceClassification` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-rerank-vl-1b-v2` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve nvidia/llama-nemotron-rerank-vl-1b-v2 \
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--chat-template examples/pooling/score/template/nemotron-vl-rerank.jinja
|
||||
```
|
||||
|
||||
!!! note
|
||||
The chat template bundled with this checkpoint's tokenizer is not suitable
|
||||
for the Score/Rerank APIs. Use the provided override template when serving:
|
||||
`examples/pooling/score/template/nemotron-vl-rerank.jinja`.
|
||||
|
||||
Score a text query against an image document:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
|
||||
"data_1": "Find diagrams about autonomous robots",
|
||||
"data_2": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Robotics workflow diagram."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Rerank image documents by a text query:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "nvidia/llama-nemotron-rerank-vl-1b-v2",
|
||||
"query": "Find diagrams about autonomous robots",
|
||||
"documents": [
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_1>"}},
|
||||
{"type": "text", "text": "Robotics workflow diagram."}
|
||||
]
|
||||
},
|
||||
{
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64_2>"}},
|
||||
{"type": "text", "text": "General skyline photo."}
|
||||
]
|
||||
}
|
||||
],
|
||||
"top_n": 2
|
||||
}'
|
||||
```
|
||||
|
||||
### BAAI/bge-m3
|
||||
|
||||
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
|
||||
|
||||
@@ -842,6 +842,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaNemotronVLForSequenceClassification` | Llama Nemotron Reranker + SigLIP | T + I<sup>E+</sup> | `nvidia/llama-nemotron-rerank-vl-1b-v2` | | |
|
||||
| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
|
||||
15
examples/pooling/score/template/nemotron-vl-rerank.jinja
Normal file
15
examples/pooling/score/template/nemotron-vl-rerank.jinja
Normal file
@@ -0,0 +1,15 @@
|
||||
{%- set query_msg = (messages | selectattr('role', 'equalto', 'query') | list | first) -%}
|
||||
{%- set doc_msg = (messages | selectattr('role', 'equalto', 'document') | list | first) -%}
|
||||
|
||||
{%- set q = query_msg['content'] -%}
|
||||
{%- set d = doc_msg['content'] -%}
|
||||
|
||||
{# If the doc contains <image> anywhere, emit a single <image> at the very front; every <image> placeholder is stripped from both the query and the doc text. #}
|
||||
{%- set has_image = ("<image>" in d) -%}
|
||||
{%- set d_clean = d | replace("<image>", "") -%}
|
||||
{%- set q_clean = q | replace("<image>", "") -%}
|
||||
|
||||
{%- if has_image -%}<image>{{ " " }}{%- endif -%}
|
||||
question:{{ q_clean }}{{ " " }}
|
||||
{{ " " }}
|
||||
{{ " " }}passage:{{ d_clean }}
|
||||
355
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
Normal file
355
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
Normal file
@@ -0,0 +1,355 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for the LlamaNemotronVL model family:
|
||||
- nvidia/llama-nemotron-embed-vl-1b-v2 (LlamaNemotronVLForCausalLM / embed)
|
||||
- nvidia/llama-nemotron-rerank-vl-1b-v2
|
||||
(LlamaNemotronVLForSequenceClassification / rerank)
|
||||
|
||||
Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
|
||||
"""
|
||||
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor
|
||||
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatCompletionContentPartImageParam,
|
||||
ChatCompletionContentPartTextParam,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Prefixes used by the model API
|
||||
QUERY_PREFIX = "query: "
|
||||
PASSAGE_PREFIX = "passage: "
|
||||
|
||||
# Text prompts for text-only embedding
|
||||
HF_TEXT_PROMPTS = [
|
||||
# T -> X (text embedding queries)
|
||||
f"{QUERY_PREFIX}The label of the object is stop sign",
|
||||
f"{QUERY_PREFIX}cherry blossom",
|
||||
]
|
||||
|
||||
# Image prompts using the model's expected format
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
# I -> X (image embedding as passage/document)
|
||||
"stop_sign": f"{PASSAGE_PREFIX}<image>",
|
||||
"cherry_blossom": f"{PASSAGE_PREFIX}<image>",
|
||||
}
|
||||
)
|
||||
|
||||
MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Assert that vLLM and HF produce matching embeddings for the same inputs.

    Each text must begin with ``QUERY_PREFIX`` or ``PASSAGE_PREFIX``; the prefix
    selects which HF encode method is used for the reference embedding and is
    stripped before the text is handed to HF.

    NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.
    """
    # vLLM side: a single batched embed call over all prompts.
    vllm_kwargs = {
        "runner": "pooling",
        "dtype": dtype,
        "max_model_len": 2048,
        "enforce_eager": True,
        "trust_remote_code": True,
    }
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # HF side: one encode_queries/encode_documents call per prompt.
    hf_outputs = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        encoder = hf_model.model
        for text, image in zip(input_texts, input_images):
            is_query = text.startswith(QUERY_PREFIX)
            if not is_query and not text.startswith(PASSAGE_PREFIX):
                raise ValueError(
                    f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
                )

            with torch.inference_mode():
                if is_query:
                    # Queries are text-only: drop the prefix and encode.
                    embedding = encoder.encode_queries([text[len(QUERY_PREFIX) :]])
                else:
                    # Passages may carry an image; only pass `images` when
                    # one is actually present.
                    doc_kwargs = {"texts": [text[len(PASSAGE_PREFIX) :]]}
                    if image is not None:
                        doc_kwargs["images"] = [image]
                    embedding = encoder.encode_documents(**doc_kwargs)

                hf_outputs.append(embedding[0].tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the text-only prompts."""
    input_texts = list(HF_TEXT_PROMPTS)
    # Text-only: every prompt is paired with "no image".
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the image passage prompts."""
    # Pair each image prompt with its asset once, then split into the two
    # parallel lists that `_run_test` expects.
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    input_texts = [prompt for prompt, _ in prompt_asset_pairs]
    input_images = [asset.pil_image for _, asset in prompt_asset_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reranker tests — nvidia/llama-nemotron-rerank-vl-1b-v2
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
RERANKER_MODELS = ["nvidia/llama-nemotron-rerank-vl-1b-v2"]
|
||||
|
||||
# The tokenizer's built-in chat template is not suitable for the Score/Rerank
|
||||
# APIs (it's inherited from the base LLM). We must use the provided override.
|
||||
_RERANKER_SCORE_TEMPLATE = (
|
||||
Path(__file__).parents[4]
|
||||
/ "examples/pooling/score/template/nemotron-vl-rerank.jinja"
|
||||
).read_text()
|
||||
|
||||
RERANKER_TEXT_QUERY = "How is AI improving the intelligence and capabilities of robots?"
|
||||
RERANKER_TEXT_DOCS = [
|
||||
"AI enables robots to perceive, plan, and act autonomously.",
|
||||
(
|
||||
"A biological foundation model designed to analyze DNA, RNA, "
|
||||
"and protein sequences."
|
||||
),
|
||||
]
|
||||
|
||||
RERANKER_IMAGE_QUERY = "photo of a red stop sign on a street"
|
||||
|
||||
|
||||
def _pil_to_data_uri(image) -> str:
|
||||
buf = BytesIO()
|
||||
image.save(buf, format="PNG")
|
||||
b64 = base64.b64encode(buf.getvalue()).decode()
|
||||
return f"data:image/png;base64,{b64}"
|
||||
|
||||
|
||||
def _run_hf_reranker(
    hf_runner: type[HfRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> list[float]:
    """Score ``query`` against each document with the HF reference model.

    ``docs`` is a list of ``(doc_text, doc_image)`` pairs; a ``None`` in either
    slot is passed to the processor as an empty string. Returns one
    sigmoid-activated score per document.
    """
    with hf_runner(
        model,
        dtype=dtype,
        trust_remote_code=True,
        auto_cls=AutoModelForSequenceClassification,
    ) as hf_model:
        processor = AutoProcessor.from_pretrained(
            model,
            trust_remote_code=True,
            max_input_tiles=6,
            use_thumbnail=True,
            rerank_max_length=2048,
        )

        # One cross-encoder example per document, all sharing the same query.
        examples = []
        for doc_text, doc_image in docs:
            examples.append(
                {
                    "question": query,
                    "doc_text": "" if doc_text is None else doc_text,
                    "doc_image": "" if doc_image is None else doc_image,
                }
            )

        batch_dict = processor.process_queries_documents_crossencoder(examples)

        # Move tensor entries onto the model's device; leave everything else
        # untouched.
        device = hf_model.model.device
        batch_dict = {
            key: value.to(device) if isinstance(value, torch.Tensor) else value
            for key, value in batch_dict.items()
        }

        with torch.inference_mode():
            logits = hf_model.model(**batch_dict, return_dict=True).logits
            # vLLM applies sigmoid activation to the raw logits before
            # returning scores; apply the same here so both sides are
            # comparable.
            scores = torch.sigmoid(logits.squeeze(-1).float())
            return scores.detach().cpu().tolist()
|
||||
|
||||
|
||||
def _run_vllm_reranker(
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> list[float]:
    """Score ``query`` against each document with vLLM.

    ``docs`` is a list of ``(doc_text, doc_image)`` pairs where ``doc_image`` may
    be ``None``. When no document carries an image the plain string score API
    is used; otherwise every pair is sent as ``ScoreMultiModalParam`` content.
    Both paths pass the override score chat template.
    """
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
    ) as vllm_model:
        if all(doc_image is None for _, doc_image in docs):
            # Text-only path: use the simple string score API.
            return vllm_model.score(
                [query] * len(docs),
                [doc_text for doc_text, _ in docs],
                chat_template=_RERANKER_SCORE_TEMPLATE,
            )

        # Multimodal path: the query is text-only and identical for every
        # pair, so a single param object is repeated.
        query_param = ScoreMultiModalParam(
            content=[
                ChatCompletionContentPartTextParam(
                    type="text",
                    text=query,
                )
            ]
        )

        doc_params = []
        for doc_text, doc_image in docs:
            parts: list = []
            if doc_image is not None:
                # Image part goes first, followed by any text.
                parts.append(
                    ChatCompletionContentPartImageParam(
                        type="image_url",
                        image_url={"url": _pil_to_data_uri(doc_image)},
                    )
                )
            if doc_text:
                parts.append(
                    ChatCompletionContentPartTextParam(
                        type="text",
                        text=doc_text,
                    )
                )
            doc_params.append(ScoreMultiModalParam(content=parts))

        raw_outputs = vllm_model.llm.score(
            [query_param] * len(docs),
            doc_params,
            chat_template=_RERANKER_SCORE_TEMPLATE,
        )
        return [o.outputs.score for o in raw_outputs]
|
||||
|
||||
|
||||
def _run_reranker_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> None:
    """Assert that HF and vLLM reranker scores agree for ``query`` over ``docs``.

    Scores are compared pairwise with a 2% relative tolerance after checking
    that both backends returned the same number of scores.

    NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.
    """
    vllm_scores = _run_vllm_reranker(vllm_runner, model, dtype, query, docs)
    hf_scores = _run_hf_reranker(hf_runner, model, dtype, query, docs)

    same_length = len(hf_scores) == len(vllm_scores)
    assert same_length, (
        f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}"
    )

    for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)):
        expected = pytest.approx(vllm_score, rel=0.02)
        assert hf_score == expected, (
            f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}"
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", RERANKER_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_reranker_text(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Rerank text documents against a text query and compare HF with vLLM."""
    # Each document is a (text, image) pair; there is no image here.
    text_only_docs = [(doc_text, None) for doc_text in RERANKER_TEXT_DOCS]
    _run_reranker_test(
        hf_runner,
        vllm_runner,
        model,
        dtype,
        query=RERANKER_TEXT_QUERY,
        docs=text_only_docs,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", RERANKER_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_reranker_image_doc(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Rerank image documents against a text query and compare HF with vLLM."""
    # Each document is a (text, image) pair; these carry an image and no text.
    image_only_docs = [(None, asset.pil_image) for asset in image_assets]
    _run_reranker_test(
        hf_runner,
        vllm_runner,
        model,
        dtype,
        query=RERANKER_IMAGE_QUERY,
        docs=image_only_docs,
    )
|
||||
@@ -1,148 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for LlamaNemotronVL embedding model (nvidia/llama-nemotron-embed-vl-1b-v2).
|
||||
|
||||
This model uses SigLIP vision encoder with bidirectional LLaMA for embeddings.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Prefixes used by the model API
|
||||
QUERY_PREFIX = "query: "
|
||||
PASSAGE_PREFIX = "passage: "
|
||||
|
||||
# Text prompts for text-only embedding
|
||||
HF_TEXT_PROMPTS = [
|
||||
# T -> X (text embedding queries)
|
||||
f"{QUERY_PREFIX}The label of the object is stop sign",
|
||||
f"{QUERY_PREFIX}cherry blossom",
|
||||
]
|
||||
|
||||
# Image prompts using the model's expected format
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
# I -> X (image embedding as passage/document)
|
||||
"stop_sign": f"{PASSAGE_PREFIX}<image>",
|
||||
"cherry_blossom": f"{PASSAGE_PREFIX}<image>",
|
||||
}
|
||||
)
|
||||
|
||||
MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Assert that vLLM and HF produce matching embeddings for the same inputs.

    Each text must begin with ``QUERY_PREFIX`` or ``PASSAGE_PREFIX``; the prefix
    selects which HF encode method is used for the reference embedding and is
    stripped before the text is handed to HF.

    NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.
    """
    # vLLM side: a single batched embed call over all prompts.
    vllm_kwargs = {
        "runner": "pooling",
        "dtype": dtype,
        "max_model_len": 2048,
        "enforce_eager": True,
        "trust_remote_code": True,
    }
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # HF side: one encode_queries/encode_documents call per prompt.
    hf_outputs = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        encoder = hf_model.model
        for text, image in zip(input_texts, input_images):
            is_query = text.startswith(QUERY_PREFIX)
            if not is_query and not text.startswith(PASSAGE_PREFIX):
                raise ValueError(
                    f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
                )

            with torch.inference_mode():
                if is_query:
                    # Queries are text-only: drop the prefix and encode.
                    embedding = encoder.encode_queries([text[len(QUERY_PREFIX) :]])
                else:
                    # Passages may carry an image; only pass `images` when
                    # one is actually present.
                    doc_kwargs = {"texts": [text[len(PASSAGE_PREFIX) :]]}
                    if image is not None:
                        doc_kwargs["images"] = [image]
                    embedding = encoder.encode_documents(**doc_kwargs)

                hf_outputs.append(embedding[0].tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the text-only prompts."""
    input_texts = list(HF_TEXT_PROMPTS)
    # Text-only: every prompt is paired with "no image".
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the image passage prompts."""
    # Pair each image prompt with its asset once, then split into the two
    # parallel lists that `_run_test` expects.
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    input_texts = [prompt for prompt, _ in prompt_asset_pairs]
    input_images = [asset.pil_image for _, asset in prompt_asset_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
|
||||
@@ -653,6 +653,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
|
||||
"LlamaBidirectionalForSequenceClassification": _HfExamplesInfo(
|
||||
"nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True
|
||||
),
|
||||
"LlamaNemotronVLForSequenceClassification": _HfExamplesInfo(
|
||||
"nvidia/llama-nemotron-rerank-vl-1b-v2", trust_remote_code=True
|
||||
),
|
||||
"ModernBertForSequenceClassification": _HfExamplesInfo(
|
||||
"Alibaba-NLP/gte-reranker-modernbert-base"
|
||||
),
|
||||
|
||||
@@ -664,6 +664,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
|
||||
"LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig,
|
||||
"LlamaBidirectionalModel": LlamaBidirectionalConfig,
|
||||
"LlamaNemotronVLModel": LlamaNemotronVLConfig,
|
||||
"LlamaNemotronVLForSequenceClassification": LlamaNemotronVLConfig,
|
||||
"NomicBertModel": NomicBertModelConfig,
|
||||
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
|
||||
"Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
# Copyright (c) 2023 OpenGVLab
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
import math
|
||||
from abc import ABC
|
||||
from collections.abc import Iterable
|
||||
|
||||
@@ -18,6 +19,7 @@ from transformers import AutoModel, PretrainedConfig
|
||||
from transformers.image_processing_utils_fast import BaseImageProcessorFast
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
from vllm.model_executor.layers.pooler import DispatchPooler
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.quantization.awq import AWQConfig
|
||||
@@ -42,6 +44,7 @@ from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
|
||||
|
||||
from .interfaces import (
|
||||
MultiModalEmbeddings,
|
||||
SupportsCrossEncoding,
|
||||
SupportsLoRA,
|
||||
SupportsMultiModal,
|
||||
SupportsPP,
|
||||
@@ -883,3 +886,57 @@ class LlamaNemotronVLForEmbedding(LlamaNemotronVLChatModel, VllmModelForPooling)
|
||||
"""Override to use different weight mapping for SigLIP."""
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights, mapper=self.weight_mapper)
|
||||
|
||||
|
||||
class LlamaNemotronVLForSequenceClassification(
    LlamaNemotronVLForEmbedding, SupportsCrossEncoding
):
    """LlamaNemotronVL variant with a classification head for reranking.

    Extends the embedding model with a top-level ``score`` linear layer and a
    sequence-classification pooler that uses it as the classifier.
    """

    # Reranker checkpoint places base model weights under `model.*`,
    # while `score.*` remains at the top level. Strip the `model.` prefix and
    # then apply the embedding model's own mapping.
    weight_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) | (
        LlamaNemotronVLForEmbedding.weight_mapper
    )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        """Build the base model, then attach the ``score`` head and pooler."""
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        model_config = vllm_config.model_config
        num_labels = model_config.hf_config.get_text_config().num_labels

        # Bias-free projection from the hidden size to `num_labels`.
        self.score = ReplicatedLinear(
            model_config.get_hidden_size(),
            num_labels,
            bias=False,
            params_dtype=model_config.head_dtype,
            quant_config=vllm_config.quant_config,
            return_bias=False,
            prefix=maybe_prefix(prefix, "score"),
        )

        pooler_config = model_config.pooler_config
        assert pooler_config is not None
        self.pooler = DispatchPooler.for_seq_cls(pooler_config, classifier=self.score)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint weights and fill in the unused inner LM head.

        Returns the set of parameter names considered loaded, including any
        ``language_model.score.*`` parameters that were initialized here
        rather than read from the checkpoint.
        """
        loaded = super().load_weights(weights)

        # The reranker checkpoint omits the inner LM seq-cls head
        # (`language_model.score.*`). It is unused by this outer model, but
        # the default loader expects all parameters to be initialized.
        inner_head_prefix = "language_model.score."
        for name, param in self.named_parameters():
            if name in loaded or not name.startswith(inner_head_prefix):
                continue

            if name.endswith(".weight"):
                torch.nn.init.kaiming_uniform_(param, a=math.sqrt(5))
            elif name.endswith(".bias"):
                torch.nn.init.zeros_(param)
            else:
                torch.nn.init.normal_(param, mean=0.0, std=0.02)

            # Report the placeholder as loaded so the loader does not flag it.
            loaded.add(name)

        return loaded
|
||||
|
||||
@@ -284,6 +284,10 @@ _CROSS_ENCODER_MODELS = {
|
||||
"llama",
|
||||
"LlamaBidirectionalForSequenceClassification",
|
||||
),
|
||||
"LlamaNemotronVLForSequenceClassification": (
|
||||
"nemotron_vl",
|
||||
"LlamaNemotronVLForSequenceClassification",
|
||||
),
|
||||
"ModernBertForSequenceClassification": (
|
||||
"modernbert",
|
||||
"ModernBertForSequenceClassification",
|
||||
|
||||
Reference in New Issue
Block a user