[Model] Add support for nvidia/llama-nemotron-rerank-vl-1b-v2 (#35735)
Signed-off-by: Jakub Zakrzewski <jzakrzewski@nvidia.com>
This commit is contained in:
355
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
Normal file
355
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
Normal file
@@ -0,0 +1,355 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for the LlamaNemotronVL model family:
- nvidia/llama-nemotron-embed-vl-1b-v2 (LlamaNemotronVLForCausalLM / embed)
- nvidia/llama-nemotron-rerank-vl-1b-v2
  (LlamaNemotronVLForSequenceClassification / rerank)

Both variants pair a SigLIP vision encoder with a bidirectional LLaMA backbone.
|
||||
"""
|
||||
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor
|
||||
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatCompletionContentPartImageParam,
|
||||
ChatCompletionContentPartTextParam,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Prefixes used by the model API
QUERY_PREFIX = "query: "
PASSAGE_PREFIX = "passage: "

# Text prompts for text-only embedding
HF_TEXT_PROMPTS = [
    # T -> X (text embedding queries)
    f"{QUERY_PREFIX}The label of the object is stop sign",
    f"{QUERY_PREFIX}cherry blossom",
]

# Image prompts using the model's expected format
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        # I -> X (image embedding as passage/document)
        "stop_sign": f"{PASSAGE_PREFIX}<image>",
        "cherry_blossom": f"{PASSAGE_PREFIX}<image>",
    }
)

MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Run embedding comparison test between HF and vLLM.

    NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.

    Args:
        hf_runner: Factory for the HF reference runner (context manager).
        vllm_runner: Factory for the vLLM runner (context manager).
        input_texts: Prompts; each must start with ``QUERY_PREFIX`` or
            ``PASSAGE_PREFIX``.
        input_images: One entry per prompt, paired positionally with
            ``input_texts``; ``None`` marks a text-only prompt.
        model: Model identifier passed to both runners.
        dtype: Dtype string passed to both runners.

    Raises:
        ValueError: If a prompt starts with neither known prefix.
    """
    # Run vLLM inference first
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # Run HF inference using the model's encode_queries/encode_documents API
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_outputs = []
        for text, image in zip(input_texts, input_images):
            with torch.inference_mode():
                if text.startswith(QUERY_PREFIX):
                    # Strip prefix and use encode_queries for query texts.
                    # The paired image (if any) is not used on this path.
                    query_text = text[len(QUERY_PREFIX) :]
                    embedding = hf_model.model.encode_queries([query_text])
                elif text.startswith(PASSAGE_PREFIX):
                    # Strip prefix and use encode_documents for passages/images
                    passage_text = text[len(PASSAGE_PREFIX) :]
                    if image is not None:
                        # Image document - pass image to encode_documents
                        embedding = hf_model.model.encode_documents(
                            images=[image],
                            texts=[passage_text],
                        )
                    else:
                        # Text-only document
                        embedding = hf_model.model.encode_documents(
                            texts=[passage_text]
                        )
                else:
                    raise ValueError(
                        f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
                    )

                # Each call encodes a single item; keep its only row.
                hf_outputs.append(embedding[0].tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Test text-only embedding."""
    input_texts = list(HF_TEXT_PROMPTS)
    # Text-only prompts carry no image: one None placeholder per prompt.
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Test image embedding."""
    # Pair each image prompt with its asset positionally.
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    input_texts = [text for text, _ in prompt_asset_pairs]
    input_images = [asset.pil_image for _, asset in prompt_asset_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Reranker tests — nvidia/llama-nemotron-rerank-vl-1b-v2
# ---------------------------------------------------------------------------

RERANKER_MODELS = ["nvidia/llama-nemotron-rerank-vl-1b-v2"]

# The tokenizer's built-in chat template is not suitable for the Score/Rerank
# APIs (it's inherited from the base LLM). We must use the provided override.
# Read with an explicit encoding so the result does not depend on the locale.
_RERANKER_SCORE_TEMPLATE = (
    Path(__file__).parents[4]
    / "examples/pooling/score/template/nemotron-vl-rerank.jinja"
).read_text(encoding="utf-8")

RERANKER_TEXT_QUERY = "How is AI improving the intelligence and capabilities of robots?"
RERANKER_TEXT_DOCS = [
    "AI enables robots to perceive, plan, and act autonomously.",
    (
        "A biological foundation model designed to analyze DNA, RNA, "
        "and protein sequences."
    ),
]

RERANKER_IMAGE_QUERY = "photo of a red stop sign on a street"
|
||||
|
||||
|
||||
def _pil_to_data_uri(image) -> str:
    """Return *image* encoded as a base64 PNG ``data:`` URI.

    Args:
        image: Object exposing ``save(fp, format=...)``; it is asked to
            write itself as PNG into an in-memory buffer.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    buf = BytesIO()
    image.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/png;base64,{b64}"
|
||||
|
||||
|
||||
def _run_hf_reranker(
    hf_runner: type[HfRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> list[float]:
    """Run HF reranker inference; docs is a list of (doc_text, doc_image|None).

    Args:
        hf_runner: Factory for the HF reference runner (context manager).
        model: Model identifier used for both the model and its processor.
        dtype: Dtype string passed to the runner.
        query: Query scored against every document.
        docs: ``(doc_text, doc_image)`` pairs; either element may be ``None``.

    Returns:
        Sigmoid-activated scores moved to CPU and converted with ``tolist()``.
    """
    with hf_runner(
        model,
        dtype=dtype,
        trust_remote_code=True,
        auto_cls=AutoModelForSequenceClassification,
    ) as hf_model:
        processor = AutoProcessor.from_pretrained(
            model,
            trust_remote_code=True,
            max_input_tiles=6,
            use_thumbnail=True,
            rerank_max_length=2048,
        )
        # Missing text/image fields are passed as empty strings.
        examples = [
            {
                "question": query,
                "doc_text": doc_text if doc_text is not None else "",
                "doc_image": doc_image if doc_image is not None else "",
            }
            for doc_text, doc_image in docs
        ]
        batch_dict = processor.process_queries_documents_crossencoder(examples)
        # Move tensor entries to the model's device; leave other values as-is.
        batch_dict = {
            k: v.to(hf_model.model.device) if isinstance(v, torch.Tensor) else v
            for k, v in batch_dict.items()
        }
        with torch.inference_mode():
            logits = hf_model.model(**batch_dict, return_dict=True).logits
        # vLLM applies sigmoid activation to the raw logits before returning
        # scores; apply the same here so both sides are comparable.
        scores = torch.sigmoid(logits.squeeze(-1).float())
        return scores.detach().cpu().tolist()
|
||||
|
||||
|
||||
def _run_vllm_reranker(
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> list[float]:
    """Run vLLM reranker inference; docs is a list of (doc_text, doc_image|None).

    Args:
        vllm_runner: Factory for the vLLM runner (context manager).
        model: Model identifier passed to the runner.
        dtype: Dtype string passed to the runner.
        query: Query scored against every document.
        docs: ``(doc_text, doc_image)`` pairs; ``doc_image`` may be ``None``.

    Returns:
        The scores produced by the vLLM score call for ``docs``.
    """
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
    ) as vllm_model:
        has_images = any(img is not None for _, img in docs)

        if not has_images:
            # Text-only path: use the simple string score API.
            queries = [query] * len(docs)
            doc_texts = [doc_text for doc_text, _ in docs]
            outputs = vllm_model.score(
                queries,
                doc_texts,
                chat_template=_RERANKER_SCORE_TEMPLATE,
            )
        else:
            # Multimodal path: build ScoreMultiModalParam for each pair.
            query_params = [
                ScoreMultiModalParam(
                    content=[
                        ChatCompletionContentPartTextParam(
                            type="text",
                            text=query,
                        )
                    ]
                )
            ] * len(docs)

            doc_params = []
            for doc_text, doc_image in docs:
                # Image part (if any) goes first, then non-empty text.
                content: list = []
                if doc_image is not None:
                    content.append(
                        ChatCompletionContentPartImageParam(
                            type="image_url",
                            image_url={"url": _pil_to_data_uri(doc_image)},
                        )
                    )
                if doc_text:
                    content.append(
                        ChatCompletionContentPartTextParam(
                            type="text",
                            text=doc_text,
                        )
                    )
                doc_params.append(ScoreMultiModalParam(content=content))

            raw_outputs = vllm_model.llm.score(
                query_params,
                doc_params,
                chat_template=_RERANKER_SCORE_TEMPLATE,
            )
            outputs = [o.outputs.score for o in raw_outputs]

    return outputs
|
||||
|
||||
|
||||
def _run_reranker_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> None:
    """Compare HF and vLLM reranker scores.

    NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.

    Args:
        hf_runner: Factory for the HF reference runner.
        vllm_runner: Factory for the vLLM runner.
        model: Model identifier passed to both runners.
        dtype: Dtype string passed to both runners.
        query: Query scored against every document.
        docs: ``(doc_text, doc_image)`` pairs forwarded to both backends.

    Raises:
        AssertionError: If the score counts differ or any pair of scores
            differs by more than a 2% relative tolerance.
    """
    vllm_scores = _run_vllm_reranker(vllm_runner, model, dtype, query, docs)
    hf_scores = _run_hf_reranker(hf_runner, model, dtype, query, docs)

    # Check lengths first so the pairwise zip below cannot silently truncate.
    assert len(hf_scores) == len(vllm_scores), (
        f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}"
    )
    for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)):
        assert hf_score == pytest.approx(vllm_score, rel=0.02), (
            f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}"
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", RERANKER_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_reranker_text(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Test reranking with text-only query and text documents."""
    # Each doc is a (text, image) pair; text-only docs carry no image.
    docs = [(text, None) for text in RERANKER_TEXT_DOCS]
    _run_reranker_test(hf_runner, vllm_runner, model, dtype, RERANKER_TEXT_QUERY, docs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", RERANKER_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_reranker_image_doc(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Test reranking with text query against image documents."""
    # Each doc is a (text, image) pair; image-only docs carry no text.
    docs = [(None, asset.pil_image) for asset in image_assets]
    _run_reranker_test(hf_runner, vllm_runner, model, dtype, RERANKER_IMAGE_QUERY, docs)
|
||||
@@ -1,148 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for the LlamaNemotronVL embedding model (nvidia/llama-nemotron-embed-vl-1b-v2).

This model uses a SigLIP vision encoder with a bidirectional LLaMA for embeddings.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Prefixes used by the model API
QUERY_PREFIX = "query: "
PASSAGE_PREFIX = "passage: "

# Text prompts for text-only embedding
HF_TEXT_PROMPTS = [
    # T -> X (text embedding queries)
    f"{QUERY_PREFIX}The label of the object is stop sign",
    f"{QUERY_PREFIX}cherry blossom",
]

# Image prompts using the model's expected format
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        # I -> X (image embedding as passage/document)
        "stop_sign": f"{PASSAGE_PREFIX}<image>",
        "cherry_blossom": f"{PASSAGE_PREFIX}<image>",
    }
)

MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]
|
||||
|
||||
|
||||
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Run embedding comparison test between HF and vLLM.

    NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.

    Args:
        hf_runner: Factory for the HF reference runner (context manager).
        vllm_runner: Factory for the vLLM runner (context manager).
        input_texts: Prompts; each must start with ``QUERY_PREFIX`` or
            ``PASSAGE_PREFIX``.
        input_images: One entry per prompt, paired positionally with
            ``input_texts``; ``None`` marks a text-only prompt.
        model: Model identifier passed to both runners.
        dtype: Dtype string passed to both runners.

    Raises:
        ValueError: If a prompt starts with neither known prefix.
    """
    # Run vLLM inference first
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # Run HF inference using the model's encode_queries/encode_documents API
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_outputs = []
        for text, image in zip(input_texts, input_images):
            with torch.inference_mode():
                if text.startswith(QUERY_PREFIX):
                    # Strip prefix and use encode_queries for query texts.
                    # The paired image (if any) is not used on this path.
                    query_text = text[len(QUERY_PREFIX) :]
                    embedding = hf_model.model.encode_queries([query_text])
                elif text.startswith(PASSAGE_PREFIX):
                    # Strip prefix and use encode_documents for passages/images
                    passage_text = text[len(PASSAGE_PREFIX) :]
                    if image is not None:
                        # Image document - pass image to encode_documents
                        embedding = hf_model.model.encode_documents(
                            images=[image],
                            texts=[passage_text],
                        )
                    else:
                        # Text-only document
                        embedding = hf_model.model.encode_documents(
                            texts=[passage_text]
                        )
                else:
                    raise ValueError(
                        f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
                    )

                # Each call encodes a single item; keep its only row.
                hf_outputs.append(embedding[0].tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Test text-only embedding."""
    input_texts = list(HF_TEXT_PROMPTS)
    # Text-only prompts carry no image: one None placeholder per prompt.
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Test image embedding."""
    # Pair each image prompt with its asset positionally.
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    input_texts = [text for text, _ in prompt_asset_pairs]
    input_images = [asset.pil_image for _, asset in prompt_asset_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
|
||||
Reference in New Issue
Block a user