[Frontend] Support multimodal inputs for late-interaction scoring (ColQwen3) + NewModel: nvidia/nemotron-colembed (#34574)

Signed-off-by: craftsangjae <craftsangjae@gmail.com>
This commit is contained in:
Kata Coder
2026-02-21 13:01:40 +09:00
committed by GitHub
parent 11be2c74dc
commit 5719a4e4e6
10 changed files with 532 additions and 66 deletions

View File

@@ -7,19 +7,31 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs.
"""
import base64
from io import BytesIO
import pytest
import torch
from PIL import Image
from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from ....conftest import VllmRunner
# Late-interaction (ColBERT-style) embedding models exercised by these tests.
MODELS = [
    "TomoroAI/tomoro-colqwen3-embed-4b",
    "OpenSearch-AI/Ops-Colqwen3-4B",
    "nvidia/nemotron-colembed-vl-4b-v2",
]
# Expected per-token embedding dimension for each model, keyed by model id.
EMBED_DIMS = {
    "TomoroAI/tomoro-colqwen3-embed-4b": 320,
    "OpenSearch-AI/Ops-Colqwen3-4B": 2560,
    "nvidia/nemotron-colembed-vl-4b-v2": 2560,
}
TEXT_QUERIES = [
@@ -33,6 +45,43 @@ TEXT_DOCUMENTS = [
]
# Run the models in half precision for the tests.
DTYPE = "half"
# Fraction of GPU memory vLLM may allocate; passed to every vllm_runner call below.
GPU_MEMORY_UTILIZATION = 0.7
def _make_base64_image(
    width: int = 64, height: int = 64, color: tuple[int, int, int] = (255, 0, 0)
) -> str:
    """Render a solid-color PNG of the given size and return it as a base64 data URI."""
    buffer = BytesIO()
    Image.new("RGB", (width, height), color).save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"
def _make_image_mm_param(
    image_uri: str,
    text: str | None = None,
) -> ScoreMultiModalParam:
    """Wrap an image URI (plus optional accompanying text) in a ScoreMultiModalParam."""
    parts: list = [
        ChatCompletionContentPartImageParam(
            type="image_url",
            image_url={"url": image_uri},
        )
    ]
    if text is not None:
        parts.append(ChatCompletionContentPartTextParam(type="text", text=text))
    return ScoreMultiModalParam(content=parts)
def _make_text_mm_param(text: str) -> ScoreMultiModalParam:
    """Wrap a plain text string in a text-only ScoreMultiModalParam."""
    part = ChatCompletionContentPartTextParam(type="text", text=text)
    return ScoreMultiModalParam(content=[part])
def _run_token_embed_test(
@@ -48,6 +97,7 @@ def _run_token_embed_test(
dtype=dtype,
max_model_len=4096,
enforce_eager=True,
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
) as vllm_model:
outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
@@ -83,6 +133,7 @@ def _run_late_interaction_test(
dtype=dtype,
max_model_len=4096,
enforce_eager=True,
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
) as vllm_model:
q_outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
d_outputs = vllm_model.token_embed([TEXT_DOCUMENTS[0]])
@@ -118,6 +169,7 @@ def _run_relevance_test(
dtype=dtype,
max_model_len=4096,
enforce_eager=True,
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
) as vllm_model:
scores = vllm_model.score(query, documents)
@@ -154,3 +206,142 @@ def test_colqwen3_relevance_ordering(
dtype: str,
) -> None:
_run_relevance_test(vllm_runner, model, dtype=dtype)
# ── Multimodal scoring tests ────────────────────────────────
def _run_multimodal_text_query_image_docs_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Score a text query against image documents via the multimodal path.

    Verifies that score_data_to_prompts correctly handles image content
    and produces valid MaxSim scores.
    """
    query = "Describe the red object"
    # One red and one blue solid-color image as the two "documents".
    image_docs = [
        _make_image_mm_param(_make_base64_image(64, 64, color=(255, 0, 0))),
        _make_image_mm_param(_make_base64_image(64, 64, color=(0, 0, 255))),
    ]
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    ) as vllm_model:
        scores = vllm_model.llm.score(query, image_docs)
        # One score per document, each a plain float.
        assert len(scores) == 2
        for result in scores:
            assert isinstance(result.outputs.score, float)
def _run_multimodal_mixed_docs_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Score a text query against a mix of text and image documents.

    Ensures the late-interaction path handles heterogeneous document
    types (plain strings alongside ScoreMultiModalParam images) in
    a single call.
    """
    query = "What is the capital of France?"
    documents: list = [
        "The capital of France is Paris.",
        _make_image_mm_param(_make_base64_image(64, 64, color=(255, 0, 0))),
    ]
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    ) as vllm_model:
        scores = vllm_model.llm.score(query, documents)
        assert len(scores) == 2
        for result in scores:
            assert isinstance(result.outputs.score, float)
        # Text document about France should score higher than a random image
        assert scores[0].outputs.score > scores[1].outputs.score
def _run_multimodal_image_query_text_docs_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Score an image query against text documents.

    Verifies the reverse direction: multimodal query with text-only
    documents through the late-interaction scoring path.
    """
    image_query = _make_image_mm_param(
        _make_base64_image(64, 64, color=(255, 0, 0)),
        text="red color",
    )
    documents = [
        "A bright red sports car.",
        "The weather forecast shows rain tomorrow.",
    ]
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    ) as vllm_model:
        scores = vllm_model.llm.score(image_query, documents)
        assert len(scores) == 2
        for result in scores:
            assert isinstance(result.outputs.score, float)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_multimodal_text_query_image_docs(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Text query scored against image-only documents (per model in MODELS)."""
    _run_multimodal_text_query_image_docs_test(vllm_runner, model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_multimodal_mixed_docs(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Text query scored against mixed text + image documents in one call."""
    _run_multimodal_mixed_docs_test(vllm_runner, model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_multimodal_image_query_text_docs(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Image (plus text) query scored against text-only documents."""
    _run_multimodal_image_query_text_docs_test(vllm_runner, model, dtype=dtype)

View File

@@ -603,6 +603,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
"OpsColQwen3Model": _HfExamplesInfo(
"OpenSearch-AI/Ops-Colqwen3-4B", trust_remote_code=True
),
"Qwen3VLNemotronEmbedModel": _HfExamplesInfo(
"nvidia/nemotron-colembed-vl-4b-v2",
),
"SiglipModel": _HfExamplesInfo("google/siglip-base-patch16-224"),
"PrithviGeoSpatialMAE": _HfExamplesInfo(
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",