# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for the LlamaNemotronVL model family:
  - nvidia/llama-nemotron-embed-vl-1b-v2
    (LlamaNemotronVLForCausalLM / embed)
  - nvidia/llama-nemotron-rerank-vl-1b-v2
    (LlamaNemotronVLForSequenceClassification / rerank)

Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
"""

from io import BytesIO
from pathlib import Path

import pybase64 as base64
import pytest
import torch
from transformers import AutoModel, AutoModelForSequenceClassification, AutoProcessor

from vllm.entrypoints.chat_utils import (
    ChatCompletionContentPartImageParam,
    ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.platforms import current_platform

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import ROCM_ENGINE_KWARGS
from ...utils import check_embeddings_close

# Prefixes used by the model API
QUERY_PREFIX = "query: "
PASSAGE_PREFIX = "passage: "

# Text prompts for text-only embedding
HF_TEXT_PROMPTS = [
    # T -> X (text embedding queries)
    f"{QUERY_PREFIX}The label of the object is stop sign",
    f"{QUERY_PREFIX}cherry blossom",
]

# Image prompts using the model's expected format
# NOTE(review): each prompt is the bare passage prefix with no visible image
# placeholder token; confirm this matches what the model expects.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        # I -> X (image embedding as passage/document)
        "stop_sign": f"{PASSAGE_PREFIX}",
        "cherry_blossom": f"{PASSAGE_PREFIX}",
    }
)

MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]


def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Run embedding comparison test between HF and vLLM.

    ``input_texts`` and ``input_images`` are consumed pairwise: entry ``i`` of
    each describes one test case (the image entry is ``None`` for text-only
    cases). Every text must start with ``QUERY_PREFIX`` or ``PASSAGE_PREFIX``;
    the prefix selects which HF encoding method is used.

    NOTE: Run vLLM first to avoid CUDA initialization issues with
    multiprocessing.

    Raises:
        ValueError: If the two input lists differ in length, or if a text
            starts with neither known prefix.
    """
    # Fail fast (before any model is loaded) instead of letting zip() below
    # silently drop the trailing cases on the HF side.
    if len(input_texts) != len(input_images):
        raise ValueError(
            f"Got {len(input_texts)} texts but {len(input_images)} images; "
            "each text needs exactly one image entry (use None for text-only)"
        )

    # Run vLLM inference first
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
        **ROCM_ENGINE_KWARGS,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # Run HF inference using the model's encode_queries/encode_documents API
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        hf_outputs = []
        for text, image in zip(input_texts, input_images):
            with torch.inference_mode():
                if text.startswith(QUERY_PREFIX):
                    # Strip prefix and use encode_queries for query texts
                    query_text = text[len(QUERY_PREFIX) :]
                    embedding = hf_model.model.encode_queries([query_text])
                elif text.startswith(PASSAGE_PREFIX):
                    # Strip prefix and use encode_documents for passages/images
                    passage_text = text[len(PASSAGE_PREFIX) :]
                    if image is not None:
                        # Image document - pass image to encode_documents
                        embedding = hf_model.model.encode_documents(
                            images=[image],
                            texts=[passage_text],
                        )
                    else:
                        # Text-only document
                        embedding = hf_model.model.encode_documents(
                            texts=[passage_text]
                        )
                else:
                    raise ValueError(
                        f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
                    )

                # Each call encodes a single item, so keep only its first row.
                hf_outputs.append(embedding[0].tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Test text-only embedding."""
    input_texts = list(HF_TEXT_PROMPTS)
    # One ``None`` per prompt marks every case as text-only.
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Test image embedding."""
    # Pair each image prompt with the PIL image of the matching asset.
    input_texts_images = [
        (text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
    ]
    input_texts = [text for text, _ in input_texts_images]
    input_images = [image for _, image in input_texts_images]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )


# ---------------------------------------------------------------------------
# Reranker tests — nvidia/llama-nemotron-rerank-vl-1b-v2
# ---------------------------------------------------------------------------

RERANKER_MODELS = ["nvidia/llama-nemotron-rerank-vl-1b-v2"]

# The tokenizer's built-in chat template is not suitable for the Score/Rerank
# APIs (it's inherited from the base LLM). We must use the provided override.
# The encoding is given explicitly so the template is decoded the same way
# regardless of the platform's locale default.
_RERANKER_SCORE_TEMPLATE = (
    Path(__file__).parents[4]
    / "examples/pooling/score/template/nemotron-vl-rerank.jinja"
).read_text(encoding="utf-8")

RERANKER_TEXT_QUERY = "How is AI improving the intelligence and capabilities of robots?"
RERANKER_TEXT_DOCS = [
    "AI enables robots to perceive, plan, and act autonomously.",
    (
        "A biological foundation model designed to analyze DNA, RNA, "
        "and protein sequences."
    ),
]

RERANKER_IMAGE_QUERY = "photo of a red stop sign on a street"


def _pil_to_data_uri(image) -> str:
    """Encode ``image`` as PNG and return it as a base64 ``data:`` URI.

    ``image`` must provide ``save(fileobj, format=...)``; the callers in this
    file pass ``asset.pil_image`` values.
    """
    buf = BytesIO()
    image.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/png;base64,{b64}"


def _run_hf_reranker(
    hf_runner: type[HfRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> list[float]:
    """Run HF reranker inference; docs is a list of (doc_text, doc_image|None).

    Returns one sigmoid-activated score per document, in input order.
    """
    with hf_runner(
        model,
        dtype=dtype,
        trust_remote_code=True,
        auto_cls=AutoModelForSequenceClassification,
    ) as hf_model:
        processor = AutoProcessor.from_pretrained(
            model,
            trust_remote_code=True,
            max_input_tiles=6,
            use_thumbnail=True,
            rerank_max_length=2048,
        )

        # A missing text or image is passed to the processor as an empty
        # string rather than None.
        examples = [
            {
                "question": query,
                "doc_text": doc_text if doc_text is not None else "",
                "doc_image": doc_image if doc_image is not None else "",
            }
            for doc_text, doc_image in docs
        ]

        batch_dict = processor.process_queries_documents_crossencoder(examples)
        # Move tensor entries to the model's device; leave other values as-is.
        batch_dict = {
            k: v.to(hf_model.model.device) if isinstance(v, torch.Tensor) else v
            for k, v in batch_dict.items()
        }

        with torch.inference_mode():
            logits = hf_model.model(**batch_dict, return_dict=True).logits

        # vLLM applies sigmoid activation to the raw logits before returning
        # scores; apply the same here so both sides are comparable.
        scores = torch.sigmoid(logits.squeeze(-1).float())
        return scores.detach().cpu().tolist()


def _run_vllm_reranker(
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> list[float]:
    """Run vLLM reranker inference; docs is a list of (doc_text, doc_image|None).

    Uses the plain string score API when no document carries an image, and
    ``ScoreMultiModalParam`` inputs otherwise. Returns one score per document.
    """
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
        **ROCM_ENGINE_KWARGS,
    ) as vllm_model:
        has_images = any(img is not None for _, img in docs)

        if not has_images:
            # Text-only path: use the simple string score API.
            queries = [query] * len(docs)
            doc_texts = [doc_text for doc_text, _ in docs]
            outputs = vllm_model.score(
                queries,
                doc_texts,
                chat_template=_RERANKER_SCORE_TEMPLATE,
            )
        else:
            # Multimodal path: build ScoreMultiModalParam for each pair.
            # The same query param object is repeated once per document.
            query_params = [
                ScoreMultiModalParam(
                    content=[
                        ChatCompletionContentPartTextParam(
                            type="text",
                            text=query,
                        )
                    ]
                )
            ] * len(docs)

            doc_params = []
            for doc_text, doc_image in docs:
                content: list = []
                if doc_image is not None:
                    content.append(
                        ChatCompletionContentPartImageParam(
                            type="image_url",
                            image_url={"url": _pil_to_data_uri(doc_image)},
                        )
                    )
                # Falsy text (None or "") contributes no text part.
                if doc_text:
                    content.append(
                        ChatCompletionContentPartTextParam(
                            type="text",
                            text=doc_text,
                        )
                    )
                doc_params.append(ScoreMultiModalParam(content=content))

            raw_outputs = vllm_model.llm.score(
                query_params,
                doc_params,
                chat_template=_RERANKER_SCORE_TEMPLATE,
            )
            outputs = [o.outputs.score for o in raw_outputs]

    return outputs


def _run_reranker_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    model: str,
    dtype: str,
    query: str,
    docs: list,
) -> None:
    """Compare HF and vLLM reranker scores.

    Asserts that both backends return the same number of scores and that each
    pair of scores agrees within a relative tolerance.

    NOTE: Run vLLM first to avoid CUDA initialization issues with
    multiprocessing.
    """
    vllm_scores = _run_vllm_reranker(vllm_runner, model, dtype, query, docs)
    hf_scores = _run_hf_reranker(hf_runner, model, dtype, query, docs)

    assert len(hf_scores) == len(vllm_scores), (
        f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}"
    )

    # NOTE: ROCm shows slightly higher numerical variance due to different
    # attention backends between vLLM and HF; use a marginally looser tolerance
    rel_tol = 0.022 if current_platform.is_rocm() else 0.02

    for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)):
        assert hf_score == pytest.approx(vllm_score, rel=rel_tol), (
            f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}"
        )


@pytest.mark.parametrize("model", RERANKER_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_reranker_text(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Test reranking with text-only query and text documents."""
    docs = [(text, None) for text in RERANKER_TEXT_DOCS]
    _run_reranker_test(hf_runner, vllm_runner, model, dtype, RERANKER_TEXT_QUERY, docs)


@pytest.mark.parametrize("model", RERANKER_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_reranker_image_doc(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Test reranking with text query against image documents."""
    docs = [(None, asset.pil_image) for asset in image_assets]
    _run_reranker_test(hf_runner, vllm_runner, model, dtype, RERANKER_IMAGE_QUERY, docs)