vllm/tests/models/multimodal/pooling/test_llama_nemotron_vl.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for the LlamaNemotronVL model family:
  - nvidia/llama-nemotron-embed-vl-1b-v2  (LlamaNemotronVLForCausalLM / embed)
  - nvidia/llama-nemotron-rerank-vl-1b-v2
      (LlamaNemotronVLForSequenceClassification / rerank)

Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
"""

import base64
from io import BytesIO
from pathlib import Path

import pytest
import torch
from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor

from vllm.entrypoints.chat_utils import (
    ChatCompletionContentPartImageParam,
    ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close

# Role prefixes placed in front of every input handed to the model API.
QUERY_PREFIX = "query: "
PASSAGE_PREFIX = "passage: "

# T -> X: text-only inputs, each embedded as a query.
HF_TEXT_PROMPTS = [
    QUERY_PREFIX + query_text
    for query_text in (
        "The label of the object is stop sign",
        "cherry blossom",
    )
]

# I -> X: image inputs, each embedded as a passage/document. The prompt is
# the passage prefix followed by the image placeholder.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": PASSAGE_PREFIX + "<image>",
        "cherry_blossom": PASSAGE_PREFIX + "<image>",
    }
)

# Embedding checkpoints exercised by the embed tests.
MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]


def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that vLLM embeddings agree with the HF reference embeddings.

    ``input_texts`` and ``input_images`` are paired by position; an image
    entry of ``None`` marks a text-only input. Each text must begin with
    ``QUERY_PREFIX`` or ``PASSAGE_PREFIX``: the prefix decides whether the
    HF reference is produced with ``encode_queries`` or ``encode_documents``.

    Raises:
        ValueError: if a text starts with neither prefix.

    NOTE: vLLM runs before HF to avoid CUDA initialization issues with
    multiprocessing.
    """
    # vLLM side: embed the prompts exactly as given (prefix included).
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # HF side: the encode_* helpers are given the text without its prefix.
    hf_outputs = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        encoder = hf_model.model
        with torch.inference_mode():
            for prompt, pil_image in zip(input_texts, input_images):
                if prompt.startswith(QUERY_PREFIX):
                    vector = encoder.encode_queries(
                        [prompt.removeprefix(QUERY_PREFIX)]
                    )
                elif prompt.startswith(PASSAGE_PREFIX):
                    body = prompt.removeprefix(PASSAGE_PREFIX)
                    if pil_image is None:
                        # Text-only document.
                        vector = encoder.encode_documents(texts=[body])
                    else:
                        # Image document: the image goes in with the text.
                        vector = encoder.encode_documents(
                            images=[pil_image],
                            texts=[body],
                        )
                else:
                    raise ValueError(
                        f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
                    )

                # One input per call, so keep the first (only) row.
                hf_outputs.append(vector[0].tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the text-only query prompts."""
    prompts = list(HF_TEXT_PROMPTS)
    # Text-only inputs carry no image: one ``None`` placeholder per prompt.
    no_images = [None] * len(prompts)

    _run_test(
        hf_runner,
        vllm_runner,
        prompts,
        no_images,  # type: ignore
        model,
        dtype=dtype,
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the image passage prompts."""
    # Pair each image prompt with its asset by position.
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    prompts = [prompt for prompt, _ in prompt_asset_pairs]
    pil_images = [asset.pil_image for _, asset in prompt_asset_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        prompts,
        pil_images,
        model,
        dtype=dtype,
    )


# ---------------------------------------------------------------------------
# Reranker tests — nvidia/llama-nemotron-rerank-vl-1b-v2
# ---------------------------------------------------------------------------

# Reranker checkpoints exercised by the rerank tests.
RERANKER_MODELS = ["nvidia/llama-nemotron-rerank-vl-1b-v2"]

# The tokenizer's built-in chat template is not suitable for the Score/Rerank
# APIs (it's inherited from the base LLM).  We must use the provided override.
# The override lives under examples/, resolved four directory levels above
# this file's directory (presumably the repository root).
# Decode it explicitly as UTF-8 so the template text does not depend on the
# platform's locale encoding.
_RERANKER_SCORE_TEMPLATE = (
    Path(__file__).parents[4]
    / "examples/pooling/score/template/nemotron-vl-rerank.jinja"
).read_text(encoding="utf-8")

# Text-only rerank fixture: one query scored against two text documents.
RERANKER_TEXT_QUERY = "How is AI improving the intelligence and capabilities of robots?"
RERANKER_TEXT_DOCS = [
    "AI enables robots to perceive, plan, and act autonomously.",
    (
        "A biological foundation model designed to analyze DNA, RNA, "
        "and protein sequences."
    ),
]

# Query used when the documents being reranked are images.
RERANKER_IMAGE_QUERY = "photo of a red stop sign on a street"


def _pil_to_data_uri(image) -> str:
    """Serialize ``image`` as PNG and return it as a base64 ``data:`` URI.

    ``image`` only needs a ``save(fp, format=...)`` method that writes its
    encoded bytes to the given binary file object.
    """
    with BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes).decode()
    return "data:image/png;base64," + encoded


def _run_hf_reranker(
    hf_runner: type[HfRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> list[float]:
    """Score ``query`` against ``docs`` with the HF reference model.

    ``docs`` is a list of ``(doc_text, doc_image)`` pairs; either member may
    be ``None``. Returns one sigmoid-activated score per document.
    """
    with hf_runner(
        model,
        dtype=dtype,
        trust_remote_code=True,
        auto_cls=AutoModelForSequenceClassification,
    ) as hf_model:
        hf_module = hf_model.model
        processor = AutoProcessor.from_pretrained(
            model,
            trust_remote_code=True,
            max_input_tiles=6,
            use_thumbnail=True,
            rerank_max_length=2048,
        )

        # One record per document; a missing modality is passed as "".
        examples = [
            {
                "question": query,
                "doc_text": "" if doc_text is None else doc_text,
                "doc_image": "" if doc_image is None else doc_image,
            }
            for doc_text, doc_image in docs
        ]
        batch = processor.process_queries_documents_crossencoder(examples)

        # Move tensor entries onto the model's device; leave the rest as-is.
        model_inputs = {}
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                value = value.to(hf_module.device)
            model_inputs[key] = value

        with torch.inference_mode():
            logits = hf_module(**model_inputs, return_dict=True).logits

        # The vLLM scores being compared against have sigmoid applied to the
        # raw logits, so do the same here to keep both sides comparable.
        probabilities = torch.sigmoid(logits.squeeze(-1).float())
        return probabilities.detach().cpu().tolist()


def _run_vllm_reranker(
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> list[float]:
    """Score ``query`` against ``docs`` with vLLM.

    ``docs`` is a list of ``(doc_text, doc_image)`` pairs; either member may
    be ``None``. If no document has an image, the plain-string score API is
    used; otherwise every pair is sent as multimodal score params. Both paths
    pass the reranker score chat template override.
    """
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
    ) as vllm_model:
        if any(doc_image is not None for _, doc_image in docs):
            # Multimodal path: the same text-only query param is repeated
            # once per document.
            query_param = ScoreMultiModalParam(
                content=[
                    ChatCompletionContentPartTextParam(
                        type="text",
                        text=query,
                    )
                ]
            )
            query_params = [query_param] * len(docs)

            doc_params = []
            for doc_text, doc_image in docs:
                # The image part (if any) precedes the text part (if any).
                image_parts: list = (
                    []
                    if doc_image is None
                    else [
                        ChatCompletionContentPartImageParam(
                            type="image_url",
                            image_url={"url": _pil_to_data_uri(doc_image)},
                        )
                    ]
                )
                text_parts: list = (
                    [
                        ChatCompletionContentPartTextParam(
                            type="text",
                            text=doc_text,
                        )
                    ]
                    if doc_text
                    else []
                )
                doc_params.append(
                    ScoreMultiModalParam(content=image_parts + text_parts)
                )

            raw_outputs = vllm_model.llm.score(
                query_params,
                doc_params,
                chat_template=_RERANKER_SCORE_TEMPLATE,
            )
            outputs = [result.outputs.score for result in raw_outputs]
        else:
            # Text-only path: the simple string score API is enough.
            outputs = vllm_model.score(
                [query] * len(docs),
                [doc_text for doc_text, _ in docs],
                chat_template=_RERANKER_SCORE_TEMPLATE,
            )

    return outputs


def _run_reranker_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> None:
    """Assert that HF and vLLM produce matching reranker scores.

    The two score lists must have equal length and agree pairwise within a
    2% relative tolerance.

    NOTE: vLLM runs before HF to avoid CUDA initialization issues with
    multiprocessing.
    """
    vllm_scores = _run_vllm_reranker(vllm_runner, model, dtype, query, docs)
    hf_scores = _run_hf_reranker(hf_runner, model, dtype, query, docs)

    n_hf = len(hf_scores)
    n_vllm = len(vllm_scores)
    assert n_hf == n_vllm, f"Output length mismatch: HF={n_hf}, vLLM={n_vllm}"

    for i, scores in enumerate(zip(hf_scores, vllm_scores)):
        hf_score, vllm_score = scores
        expected = pytest.approx(vllm_score, rel=0.02)
        assert hf_score == expected, (
            f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}"
        )


@pytest.mark.parametrize("model", RERANKER_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_reranker_text(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Rerank text documents against a text query and compare HF with vLLM."""
    # Each document is a (text, image) pair; text documents have no image.
    text_docs = [(passage, None) for passage in RERANKER_TEXT_DOCS]
    _run_reranker_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model=model,
        dtype=dtype,
        query=RERANKER_TEXT_QUERY,
        docs=text_docs,
    )


@pytest.mark.parametrize("model", RERANKER_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_reranker_image_doc(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Rerank image documents against a text query and compare HF with vLLM."""
    # Each document is a (text, image) pair; image documents have no text.
    image_docs = [(None, asset.pil_image) for asset in image_assets]
    _run_reranker_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model=model,
        dtype=dtype,
        query=RERANKER_IMAGE_QUERY,
        docs=image_docs,
    )