[Frontend] Support multimodal inputs for late-interaction scoring (ColQwen3) + NewModel: nvidia/nemotron-colembed (#34574)
Signed-off-by: craftsangjae <craftsangjae@gmail.com>
This commit is contained in:
@@ -7,19 +7,31 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token
|
||||
embeddings for both text and image inputs.
|
||||
"""
|
||||
|
||||
import base64
|
||||
from io import BytesIO
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatCompletionContentPartImageParam,
|
||||
ChatCompletionContentPartTextParam,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
|
||||
MODELS = [
|
||||
"TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"OpenSearch-AI/Ops-Colqwen3-4B",
|
||||
"nvidia/nemotron-colembed-vl-4b-v2",
|
||||
]
|
||||
|
||||
EMBED_DIMS = {
|
||||
"TomoroAI/tomoro-colqwen3-embed-4b": 320,
|
||||
"OpenSearch-AI/Ops-Colqwen3-4B": 2560,
|
||||
"nvidia/nemotron-colembed-vl-4b-v2": 2560,
|
||||
}
|
||||
|
||||
TEXT_QUERIES = [
|
||||
@@ -33,6 +45,43 @@ TEXT_DOCUMENTS = [
|
||||
]
|
||||
|
||||
DTYPE = "half"
|
||||
GPU_MEMORY_UTILIZATION = 0.7
|
||||
|
||||
|
||||
def _make_base64_image(
|
||||
width: int = 64, height: int = 64, color: tuple[int, int, int] = (255, 0, 0)
|
||||
) -> str:
|
||||
"""Create a small solid-color PNG image and return its base64 data URI."""
|
||||
img = Image.new("RGB", (width, height), color)
|
||||
buf = BytesIO()
|
||||
img.save(buf, format="PNG")
|
||||
b64 = base64.b64encode(buf.getvalue()).decode()
|
||||
return f"data:image/png;base64,{b64}"
|
||||
|
||||
|
||||
def _make_image_mm_param(
|
||||
image_uri: str,
|
||||
text: str | None = None,
|
||||
) -> ScoreMultiModalParam:
|
||||
"""Build a ScoreMultiModalParam containing an image (and optional text)."""
|
||||
content: list = [
|
||||
ChatCompletionContentPartImageParam(
|
||||
type="image_url",
|
||||
image_url={"url": image_uri},
|
||||
),
|
||||
]
|
||||
if text is not None:
|
||||
content.append(
|
||||
ChatCompletionContentPartTextParam(type="text", text=text),
|
||||
)
|
||||
return ScoreMultiModalParam(content=content)
|
||||
|
||||
|
||||
def _make_text_mm_param(text: str) -> ScoreMultiModalParam:
|
||||
"""Build a ScoreMultiModalParam containing only text."""
|
||||
return ScoreMultiModalParam(
|
||||
content=[ChatCompletionContentPartTextParam(type="text", text=text)],
|
||||
)
|
||||
|
||||
|
||||
def _run_token_embed_test(
|
||||
@@ -48,6 +97,7 @@ def _run_token_embed_test(
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
|
||||
) as vllm_model:
|
||||
outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
|
||||
|
||||
@@ -83,6 +133,7 @@ def _run_late_interaction_test(
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
|
||||
) as vllm_model:
|
||||
q_outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
|
||||
d_outputs = vllm_model.token_embed([TEXT_DOCUMENTS[0]])
|
||||
@@ -118,6 +169,7 @@ def _run_relevance_test(
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
|
||||
) as vllm_model:
|
||||
scores = vllm_model.score(query, documents)
|
||||
|
||||
@@ -154,3 +206,142 @@ def test_colqwen3_relevance_ordering(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
_run_relevance_test(vllm_runner, model, dtype=dtype)
|
||||
|
||||
|
||||
# ── Multimodal scoring tests ────────────────────────────────
|
||||
|
||||
|
||||
def _run_multimodal_text_query_image_docs_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
"""Score a text query against image documents via the multimodal path.
|
||||
|
||||
Verifies that score_data_to_prompts correctly handles image content
|
||||
and produces valid MaxSim scores.
|
||||
"""
|
||||
red_image = _make_base64_image(64, 64, color=(255, 0, 0))
|
||||
blue_image = _make_base64_image(64, 64, color=(0, 0, 255))
|
||||
|
||||
query = "Describe the red object"
|
||||
image_docs = [
|
||||
_make_image_mm_param(red_image),
|
||||
_make_image_mm_param(blue_image),
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
|
||||
) as vllm_model:
|
||||
scores = vllm_model.llm.score(query, image_docs)
|
||||
|
||||
assert len(scores) == 2
|
||||
for s in scores:
|
||||
assert isinstance(s.outputs.score, float)
|
||||
|
||||
|
||||
def _run_multimodal_mixed_docs_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
"""Score a text query against a mix of text and image documents.
|
||||
|
||||
Ensures the late-interaction path handles heterogeneous document
|
||||
types (plain strings alongside ScoreMultiModalParam images) in
|
||||
a single call.
|
||||
"""
|
||||
red_image = _make_base64_image(64, 64, color=(255, 0, 0))
|
||||
|
||||
query = "What is the capital of France?"
|
||||
documents: list = [
|
||||
"The capital of France is Paris.",
|
||||
_make_image_mm_param(red_image),
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
|
||||
) as vllm_model:
|
||||
scores = vllm_model.llm.score(query, documents)
|
||||
|
||||
assert len(scores) == 2
|
||||
for s in scores:
|
||||
assert isinstance(s.outputs.score, float)
|
||||
# Text document about France should score higher than a random image
|
||||
assert scores[0].outputs.score > scores[1].outputs.score
|
||||
|
||||
|
||||
def _run_multimodal_image_query_text_docs_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
"""Score an image query against text documents.
|
||||
|
||||
Verifies the reverse direction: multimodal query with text-only
|
||||
documents through the late-interaction scoring path.
|
||||
"""
|
||||
red_image = _make_base64_image(64, 64, color=(255, 0, 0))
|
||||
image_query = _make_image_mm_param(red_image, text="red color")
|
||||
|
||||
documents = [
|
||||
"A bright red sports car.",
|
||||
"The weather forecast shows rain tomorrow.",
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
|
||||
) as vllm_model:
|
||||
scores = vllm_model.llm.score(image_query, documents)
|
||||
|
||||
assert len(scores) == 2
|
||||
for s in scores:
|
||||
assert isinstance(s.outputs.score, float)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", [DTYPE])
|
||||
def test_colqwen3_multimodal_text_query_image_docs(
|
||||
vllm_runner,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
_run_multimodal_text_query_image_docs_test(vllm_runner, model, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", [DTYPE])
|
||||
def test_colqwen3_multimodal_mixed_docs(
|
||||
vllm_runner,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
_run_multimodal_mixed_docs_test(vllm_runner, model, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", [DTYPE])
|
||||
def test_colqwen3_multimodal_image_query_text_docs(
|
||||
vllm_runner,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
_run_multimodal_image_query_text_docs_test(vllm_runner, model, dtype=dtype)
|
||||
|
||||
Reference in New Issue
Block a user