[Frontend] Support multimodal inputs for late-interaction scoring (ColQwen3) + NewModel: nvidia/nemotron-colembed (#34574)

Signed-off-by: craftsangjae <craftsangjae@gmail.com>
This commit is contained in:
Kata Coder
2026-02-21 13:01:40 +09:00
committed by GitHub
parent 11be2c74dc
commit 5719a4e4e6
10 changed files with 532 additions and 66 deletions

View File

@@ -7,19 +7,31 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs.
"""
import base64
from io import BytesIO
import pytest
import torch
from PIL import Image
from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
)
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from ....conftest import VllmRunner
# Late-interaction (ColBERT-style) embedding models exercised by these tests.
MODELS = [
    "TomoroAI/tomoro-colqwen3-embed-4b",
    "OpenSearch-AI/Ops-Colqwen3-4B",
    "nvidia/nemotron-colembed-vl-4b-v2",
]
# Expected per-token embedding dimension for each model, keyed by model id.
EMBED_DIMS = {
    "TomoroAI/tomoro-colqwen3-embed-4b": 320,
    "OpenSearch-AI/Ops-Colqwen3-4B": 2560,
    "nvidia/nemotron-colembed-vl-4b-v2": 2560,
}
TEXT_QUERIES = [
@@ -33,6 +45,43 @@ TEXT_DOCUMENTS = [
]
# Run the models in half precision for the tests.
DTYPE = "half"
# Fraction of GPU memory vLLM may allocate; passed to every vllm_runner call below.
GPU_MEMORY_UTILIZATION = 0.7
def _make_base64_image(
    width: int = 64, height: int = 64, color: tuple[int, int, int] = (255, 0, 0)
) -> str:
    """Render a solid-color PNG of the given size and return it as a base64 data URI."""
    buffer = BytesIO()
    Image.new("RGB", (width, height), color).save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"
def _make_image_mm_param(
    image_uri: str,
    text: str | None = None,
) -> ScoreMultiModalParam:
    """Wrap an image URI (plus optional accompanying text) in a ScoreMultiModalParam."""
    parts: list = [
        ChatCompletionContentPartImageParam(
            type="image_url",
            image_url={"url": image_uri},
        )
    ]
    if text is not None:
        parts.append(ChatCompletionContentPartTextParam(type="text", text=text))
    return ScoreMultiModalParam(content=parts)
def _make_text_mm_param(text: str) -> ScoreMultiModalParam:
    """Wrap a plain text string in a text-only ScoreMultiModalParam."""
    part = ChatCompletionContentPartTextParam(type="text", text=text)
    return ScoreMultiModalParam(content=[part])
def _run_token_embed_test(
@@ -48,6 +97,7 @@ def _run_token_embed_test(
dtype=dtype,
max_model_len=4096,
enforce_eager=True,
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
) as vllm_model:
outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
@@ -83,6 +133,7 @@ def _run_late_interaction_test(
dtype=dtype,
max_model_len=4096,
enforce_eager=True,
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
) as vllm_model:
q_outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
d_outputs = vllm_model.token_embed([TEXT_DOCUMENTS[0]])
@@ -118,6 +169,7 @@ def _run_relevance_test(
dtype=dtype,
max_model_len=4096,
enforce_eager=True,
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
) as vllm_model:
scores = vllm_model.score(query, documents)
@@ -154,3 +206,142 @@ def test_colqwen3_relevance_ordering(
dtype: str,
) -> None:
_run_relevance_test(vllm_runner, model, dtype=dtype)
# ── Multimodal scoring tests ────────────────────────────────
def _run_multimodal_text_query_image_docs_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Score a text query against image documents via the multimodal path.

    Verifies that score_data_to_prompts correctly handles image content
    and produces valid MaxSim scores.
    """
    query = "Describe the red object"
    # One red and one blue solid-color image as the two "documents".
    image_docs = [
        _make_image_mm_param(_make_base64_image(64, 64, color=(255, 0, 0))),
        _make_image_mm_param(_make_base64_image(64, 64, color=(0, 0, 255))),
    ]
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    ) as vllm_model:
        scores = vllm_model.llm.score(query, image_docs)
        # One score per document, each a plain float.
        assert len(scores) == 2
        for result in scores:
            assert isinstance(result.outputs.score, float)
def _run_multimodal_mixed_docs_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Score a text query against a mix of text and image documents.

    Ensures the late-interaction path handles heterogeneous document
    types (plain strings alongside ScoreMultiModalParam images) in
    a single call.
    """
    query = "What is the capital of France?"
    documents: list = [
        "The capital of France is Paris.",
        _make_image_mm_param(_make_base64_image(64, 64, color=(255, 0, 0))),
    ]
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    ) as vllm_model:
        scores = vllm_model.llm.score(query, documents)
        assert len(scores) == 2
        for result in scores:
            assert isinstance(result.outputs.score, float)
        # Text document about France should score higher than a random image
        assert scores[0].outputs.score > scores[1].outputs.score
def _run_multimodal_image_query_text_docs_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Score an image query against text documents.

    Verifies the reverse direction: multimodal query with text-only
    documents through the late-interaction scoring path.
    """
    image_query = _make_image_mm_param(
        _make_base64_image(64, 64, color=(255, 0, 0)),
        text="red color",
    )
    documents = [
        "A bright red sports car.",
        "The weather forecast shows rain tomorrow.",
    ]
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
        gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    ) as vllm_model:
        scores = vllm_model.llm.score(image_query, documents)
        assert len(scores) == 2
        for result in scores:
            assert isinstance(result.outputs.score, float)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_multimodal_text_query_image_docs(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Text query scored against image-only documents (per model in MODELS)."""
    _run_multimodal_text_query_image_docs_test(vllm_runner, model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_multimodal_mixed_docs(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Text query scored against mixed text + image documents in one call."""
    _run_multimodal_mixed_docs_test(vllm_runner, model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_multimodal_image_query_text_docs(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Image (plus text) query scored against text-only documents."""
    _run_multimodal_image_query_text_docs_test(vllm_runner, model, dtype=dtype)

View File

@@ -603,6 +603,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
"OpsColQwen3Model": _HfExamplesInfo(
"OpenSearch-AI/Ops-Colqwen3-4B", trust_remote_code=True
),
"Qwen3VLNemotronEmbedModel": _HfExamplesInfo(
"nvidia/nemotron-colembed-vl-4b-v2",
),
"SiglipModel": _HfExamplesInfo("google/siglip-base-patch16-224"),
"PrithviGeoSpatialMAE": _HfExamplesInfo(
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",