tests/models/multimodal/pooling/test_llama_nemotron_vl_embed.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for LlamaNemotronVL embedding model (nvidia/llama-nemotron-embed-vl-1b-v2).

This model uses SigLIP vision encoder with bidirectional LLaMA for embeddings.
"""

import pytest
import torch
from transformers import AutoModel

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close

# Prefixes used by the model API to mark an input as a query or a passage
QUERY_PREFIX = "query: "
PASSAGE_PREFIX = "passage: "

# Text-only prompts, each one embedded as a query (T -> X)
HF_TEXT_PROMPTS = [
    QUERY_PREFIX + query
    for query in (
        "The label of the object is stop sign",
        "cherry blossom",
    )
]

# Image prompts, each one embedded as a passage/document (I -> X).
# Every prompt is the passage prefix followed by "<image>".
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": PASSAGE_PREFIX + "<image>",
        "cherry_blossom": PASSAGE_PREFIX + "<image>",
    }
)

# Model checkpoints the tests below are parametrized over
MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]


def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Run embedding comparison test between HF and vLLM.

    vLLM embeds all inputs in one ``embed`` call. The HF reference is built
    item by item through the model's ``encode_queries`` /
    ``encode_documents`` API, and the two result lists are then compared
    with ``check_embeddings_close``.

    NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.

    Args:
        hf_runner: Runner class used as a context manager for the HF model.
        vllm_runner: Runner class used as a context manager for the vLLM
            model.
        input_texts: Prompts; each must start with ``QUERY_PREFIX`` or
            ``PASSAGE_PREFIX``.
        input_images: One entry per prompt; ``None`` marks a text-only input.
        model: Model name passed to both runners.
        dtype: Dtype string passed to both runners.

    Raises:
        ValueError: If ``input_texts`` and ``input_images`` differ in length,
            or if a prompt starts with neither known prefix.
    """
    # Fail fast, before any model is loaded: zip() below would otherwise
    # silently drop the unmatched tail and compare fewer embeddings.
    if len(input_texts) != len(input_images):
        raise ValueError(
            "input_texts and input_images must have the same length, got "
            f"{len(input_texts)} and {len(input_images)}"
        )

    # Run vLLM inference first
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # Run HF inference using the model's encode_queries/encode_documents API
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_outputs = []
        # A single inference_mode context covers every item; it does not
        # depend on the loop variables.
        with torch.inference_mode():
            for text, image in zip(input_texts, input_images):
                if text.startswith(QUERY_PREFIX):
                    # Strip prefix and use encode_queries for query texts
                    query_text = text[len(QUERY_PREFIX) :]
                    embedding = hf_model.model.encode_queries([query_text])
                elif text.startswith(PASSAGE_PREFIX):
                    # Strip prefix and use encode_documents for passages/images
                    passage_text = text[len(PASSAGE_PREFIX) :]
                    if image is not None:
                        # Image document - pass image to encode_documents
                        embedding = hf_model.model.encode_documents(
                            images=[image],
                            texts=[passage_text],
                        )
                    else:
                        # Text-only document
                        embedding = hf_model.model.encode_documents(
                            texts=[passage_text]
                        )
                else:
                    raise ValueError(
                        f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
                    )

                # Each call above encodes exactly one item, so keep only the
                # first row of the result.
                hf_outputs.append(embedding[0].tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the text-only prompts."""
    # Text-only run: every prompt gets ``None`` in its image slot.
    input_texts = list(HF_TEXT_PROMPTS)
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the image prompts."""
    # Walk the prompts and the image assets in lockstep, collecting each
    # prompt together with the PIL image of its matching asset.
    input_texts = []
    input_images = []
    for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets):
        input_texts.append(prompt)
        input_images.append(asset.pil_image)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
[Model] Add nvidia/llama-nemotron-embed-vl-1b-v2 multimodal embedding model (#35297) Signed-off-by: Jakub Zakrzewski <jzakrzewski@nvidia.com> 2026-02-26 15:17:17 +01:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`"""`
			`Tests for LlamaNemotronVL embedding model (nvidia/llama-nemotron-embed-vl-1b-v2).`

			`This model uses SigLIP vision encoder with bidirectional LLaMA for embeddings.`
			`"""`

			`import pytest`
			`import torch`
			`from transformers import AutoModel`

			`from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner`
			`from ...utils import check_embeddings_close`

			`# Prefixes used by the model API`
			`QUERY_PREFIX = "query: "`
			`PASSAGE_PREFIX = "passage: "`

			`# Text prompts for text-only embedding`
			`HF_TEXT_PROMPTS = [`
			`# T -> X (text embedding queries)`
			`f"{QUERY_PREFIX}The label of the object is stop sign",`
			`f"{QUERY_PREFIX}cherry blossom",`
			`]`

			`# Image prompts using the model's expected format`
			`HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(`
			`{`
			`# I -> X (image embedding as passage/document)`
			`"stop_sign": f"{PASSAGE_PREFIX}<image>",`
			`"cherry_blossom": f"{PASSAGE_PREFIX}<image>",`
			`}`
			`)`

			`MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]`


			`def _run_test(`
			`hf_runner: type[HfRunner],`
			`vllm_runner: type[VllmRunner],`
			`input_texts: list[str],`
			`input_images: PromptImageInput,`
			`model: str,`
			`*,`
			`dtype: str,`
			`) -> None:`
			`"""Run embedding comparison test between HF and vLLM.`

			`NOTE: Run vLLM first to avoid CUDA initialization issues with multiprocessing.`
			`"""`
			`# Run vLLM inference first`
			`with vllm_runner(`
			`model,`
			`runner="pooling",`
			`dtype=dtype,`
			`max_model_len=2048,`
			`enforce_eager=True,`
			`trust_remote_code=True,`
			`) as vllm_model:`
			`vllm_outputs = vllm_model.embed(input_texts, images=input_images)`

			`# Run HF inference using the model's encode_queries/encode_documents API`
			`with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:`
			`hf_outputs = []`
			`for text, image in zip(input_texts, input_images):`
			`with torch.inference_mode():`
			`if text.startswith(QUERY_PREFIX):`
			`# Strip prefix and use encode_queries for query texts`
			`query_text = text[len(QUERY_PREFIX) :]`
			`embedding = hf_model.model.encode_queries([query_text])`
			`elif text.startswith(PASSAGE_PREFIX):`
			`# Strip prefix and use encode_documents for passages/images`
			`passage_text = text[len(PASSAGE_PREFIX) :]`
			`if image is not None:`
			`# Image document - pass image to encode_documents`
			`embedding = hf_model.model.encode_documents(`
			`images=[image],`
			`texts=[passage_text],`
			`)`
			`else:`
			`# Text-only document`
			`embedding = hf_model.model.encode_documents(`
			`texts=[passage_text]`
			`)`
			`else:`
			`raise ValueError(`
			`f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"`
			`)`

			`hf_outputs.append(embedding[0].tolist())`

			`check_embeddings_close(`
			`embeddings_0_lst=hf_outputs,`
			`embeddings_1_lst=vllm_outputs,`
			`name_0="hf",`
			`name_1="vllm",`
			`)`


			`@pytest.mark.parametrize("model", MODELS)`
			`@pytest.mark.parametrize("dtype", ["bfloat16"])`
			`def test_models_text(`
			`hf_runner,`
			`vllm_runner,`
			`image_assets,`
			`model: str,`
			`dtype: str,`
			`) -> None:`
			`"""Test text-only embedding."""`
			`input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]`
			`input_texts = [text for text, _ in input_texts_images]`
			`input_images = [image for _, image in input_texts_images]`

			`_run_test(`
			`hf_runner,`
			`vllm_runner,`
			`input_texts,`
			`input_images, # type: ignore`
			`model,`
			`dtype=dtype,`
			`)`


			`@pytest.mark.parametrize("model", MODELS)`
			`@pytest.mark.parametrize("dtype", ["bfloat16"])`
			`def test_models_image(`
			`hf_runner,`
			`vllm_runner,`
			`image_assets,`
			`model: str,`
			`dtype: str,`
			`) -> None:`
			`"""Test image embedding."""`
			`input_texts_images = [`
			`(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)`
			`]`
			`input_texts = [text for text, _ in input_texts_images]`
			`input_images = [image for _, image in input_texts_images]`

			`_run_test(`
			`hf_runner,`
			`vllm_runner,`
			`input_texts,`
			`input_images,`
			`model,`
			`dtype=dtype,`
			`)`