From 5719a4e4e601fb91274294d25370b7aad656d629 Mon Sep 17 00:00:00 2001 From: Kata Coder Date: Sat, 21 Feb 2026 13:01:40 +0900 Subject: [PATCH] [Frontend] Support multimodal inputs for late-interaction scoring (ColQwen3) + NewModel: nvidia/nemotron-colembed (#34574) Signed-off-by: craftsangjae --- docs/models/pooling_models.md | 65 +++++- .../pooling/score/colqwen3_rerank_online.py | 130 +++++++++++- .../multimodal/pooling/test_colqwen3.py | 191 ++++++++++++++++++ tests/models/registry.py | 3 + vllm/entrypoints/llm.py | 29 +-- vllm/entrypoints/pooling/score/serving.py | 83 +++++--- vllm/entrypoints/pooling/score/utils.py | 81 +++++++- vllm/model_executor/models/colqwen3.py | 14 +- vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 1 + 10 files changed, 532 insertions(+), 66 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index d7f13f4e3..a65bf4db5 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -382,6 +382,7 @@ ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends |---|---|---| | `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` | | `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` | +| `Qwen3VLNemotronEmbedModel` | Qwen3-VL | `nvidia/nemotron-colembed-vl-4b-v2`, `nvidia/nemotron-colembed-vl-8b-v2` | Start the server: @@ -389,7 +390,9 @@ Start the server: vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096 ``` -Then you can use the rerank endpoint: +#### Text-only scoring and reranking + +Use the `/rerank` endpoint: ```shell curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ @@ -403,7 +406,7 @@ curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ }' ``` -Or the score endpoint: +Or the `/score` endpoint: ```shell curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ @@ -413,7 +416,57 @@ curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ }' ``` -You can also get the raw token embeddings using the pooling endpoint with `token_embed` task: +#### Multi-modal scoring and reranking (text query × image documents) + +The `/score` and `/rerank` endpoints also accept multi-modal inputs directly. +Pass image documents using the `data_1`/`data_2` (for `/score`) or `documents` (for `/rerank`) fields +with a `content` list containing `image_url` and `text` parts — the same format used by the +OpenAI chat completion API: + +Score a text query against image documents: + +```shell +curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{ + "model": "TomoroAI/tomoro-colqwen3-embed-4b", + "data_1": "Retrieve the city of Beijing", + "data_2": [ + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Describe the image."} + ] + } + ] +}' +``` + +Rerank image documents by a text query: + +```shell +curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{ + "model": "TomoroAI/tomoro-colqwen3-embed-4b", + "query": "Retrieve the city of Beijing", + "documents": [ + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Describe the image."} + ] + }, + { + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}}, + {"type": "text", "text": "Describe the image."} + ] + } + ], + "top_n": 2 +}' +``` + +#### Raw token embeddings + +You can also get the raw token embeddings using the `/pooling` endpoint with `token_embed` task: ```shell curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ @@ -423,7 +476,7 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ }' ``` -For **image inputs**, use the chat-style `messages` field so that the vLLM multimodal processor handles them correctly: +For **image inputs** via the pooling endpoint, use the chat-style `messages` field: ```shell curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ @@ -440,10 +493,10 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ }' ``` -Examples can be found here: +#### Examples - Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py) -- Reranking: [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py) +- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py) ### BAAI/bge-m3 diff --git a/examples/pooling/score/colqwen3_rerank_online.py b/examples/pooling/score/colqwen3_rerank_online.py index ba1df150b..c7ab6e237 100644 --- a/examples/pooling/score/colqwen3_rerank_online.py +++ b/examples/pooling/score/colqwen3_rerank_online.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 """ -Example of using ColQwen3 late interaction model for reranking. +Example of using ColQwen3 late interaction model for reranking and scoring. ColQwen3 is a multi-modal ColBERT-style model based on Qwen3-VL. It produces per-token embeddings and uses MaxSim scoring for retrieval @@ -14,13 +15,65 @@ Then run this script: python colqwen3_rerank_online.py """ +import base64 +from io import BytesIO + import requests +from PIL import Image MODEL = "TomoroAI/tomoro-colqwen3-embed-4b" BASE_URL = "http://127.0.0.1:8000" headers = {"accept": "application/json", "Content-Type": "application/json"} +# ── Image helpers ────────────────────────────────────────── + + +def load_image(url: str) -> Image.Image: + """Download an image from URL (handles Wikimedia 403).""" + for hdrs in ( + {}, + {"User-Agent": "Mozilla/5.0 (compatible; ColQwen3-demo/1.0)"}, + ): + resp = requests.get(url, headers=hdrs, timeout=15) + if resp.status_code == 403: + continue + resp.raise_for_status() + return Image.open(BytesIO(resp.content)).convert("RGB") + raise RuntimeError(f"Could not fetch image from {url}") + + +def encode_image_base64(image: Image.Image) -> str: + """Encode a PIL image to a base64 data URI.""" + buf = BytesIO() + image.save(buf, format="PNG") + return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode() + + +def make_image_content(image_url: str, text: str = "Describe the image.") -> dict: + """Build a ScoreMultiModalParam dict from an image URL.""" + image = load_image(image_url) + return { + "content": [ + { + "type": "image_url", + "image_url": {"url": encode_image_base64(image)}, + }, + {"type": "text", "text": text}, + ] + } + + +# ── Sample image URLs ───────────────────────────────────── + +IMAGE_URLS = { + "beijing": "https://upload.wikimedia.org/wikipedia/commons/6/61/Beijing_skyline_at_night.JPG", + "london": "https://upload.wikimedia.org/wikipedia/commons/4/49/London_skyline.jpg", + "singapore": "https://upload.wikimedia.org/wikipedia/commons/2/27/Singapore_skyline_2022.jpg", +} + +# ── Text-only examples ──────────────────────────────────── + def rerank_text(): """Text-only reranking via /rerank endpoint.""" @@ -120,11 +173,86 @@ def score_text_top_n(): print(f" {response.text[:300]}") +# ── Multi-modal examples (text query × image documents) ── + + +def score_text_vs_images(): + """Score a text query against image documents via /score.""" + print() + print("=" * 60) + print("4. Multi-modal scoring: text query vs image docs (/score)") + print("=" * 60) + + query = "Retrieve the city of Beijing" + labels = list(IMAGE_URLS.keys()) + print(f"\n Loading {len(labels)} images...") + image_contents = [make_image_content(IMAGE_URLS[name]) for name in labels] + + data = { + "model": MODEL, + "data_1": query, + "data_2": image_contents, + } + + response = requests.post(f"{BASE_URL}/score", headers=headers, json=data) + + if response.status_code == 200: + result = response.json() + print(f'\n Query: "{query}"\n') + for item in result["data"]: + idx = item["index"] + print(f" Doc {idx} [{labels[idx]}] score={item['score']:.4f}") + else: + print(f" Request failed: {response.status_code}") + print(f" {response.text[:300]}") + + +def rerank_text_vs_images(): + """Rerank image documents by a text query via /rerank.""" + print() + print("=" * 60) + print("5. Multi-modal reranking: text query vs image docs (/rerank)") + print("=" * 60) + + query = "Retrieve the city of London" + labels = list(IMAGE_URLS.keys()) + print(f"\n Loading {len(labels)} images...") + image_contents = [make_image_content(IMAGE_URLS[name]) for name in labels] + + data = { + "model": MODEL, + "query": query, + "documents": image_contents, + "top_n": 2, + } + + response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data) + + if response.status_code == 200: + result = response.json() + print(f'\n Query: "{query}"') + print(f" Top {data['top_n']} results:\n") + for item in result["results"]: + idx = item["index"] + print(f" [{item['relevance_score']:.4f}] {labels[idx]}") + else: + print(f" Request failed: {response.status_code}") + print(f" {response.text[:300]}") + + +# ── Main ────────────────────────────────────────────────── + + def main(): + # Text-only rerank_text() score_text() score_text_top_n() + # Multi-modal (text query × image documents) + score_text_vs_images() + rerank_text_vs_images() + if __name__ == "__main__": main() diff --git a/tests/models/multimodal/pooling/test_colqwen3.py b/tests/models/multimodal/pooling/test_colqwen3.py index 51080cc10..0cc4c343b 100644 --- a/tests/models/multimodal/pooling/test_colqwen3.py +++ b/tests/models/multimodal/pooling/test_colqwen3.py @@ -7,19 +7,31 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token embeddings for both text and image inputs. """ +import base64 +from io import BytesIO + import pytest import torch +from PIL import Image + +from vllm.entrypoints.chat_utils import ( + ChatCompletionContentPartImageParam, + ChatCompletionContentPartTextParam, +) +from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam from ....conftest import VllmRunner MODELS = [ "TomoroAI/tomoro-colqwen3-embed-4b", "OpenSearch-AI/Ops-Colqwen3-4B", + "nvidia/nemotron-colembed-vl-4b-v2", ] EMBED_DIMS = { "TomoroAI/tomoro-colqwen3-embed-4b": 320, "OpenSearch-AI/Ops-Colqwen3-4B": 2560, + "nvidia/nemotron-colembed-vl-4b-v2": 2560, } TEXT_QUERIES = [ @@ -33,6 +45,43 @@ TEXT_DOCUMENTS = [ ] DTYPE = "half" +GPU_MEMORY_UTILIZATION = 0.7 + + +def _make_base64_image( + width: int = 64, height: int = 64, color: tuple[int, int, int] = (255, 0, 0) +) -> str: + """Create a small solid-color PNG image and return its base64 data URI.""" + img = Image.new("RGB", (width, height), color) + buf = BytesIO() + img.save(buf, format="PNG") + b64 = base64.b64encode(buf.getvalue()).decode() + return f"data:image/png;base64,{b64}" + + +def _make_image_mm_param( + image_uri: str, + text: str | None = None, +) -> ScoreMultiModalParam: + """Build a ScoreMultiModalParam containing an image (and optional text).""" + content: list = [ + ChatCompletionContentPartImageParam( + type="image_url", + image_url={"url": image_uri}, + ), + ] + if text is not None: + content.append( + ChatCompletionContentPartTextParam(type="text", text=text), + ) + return ScoreMultiModalParam(content=content) + + +def _make_text_mm_param(text: str) -> ScoreMultiModalParam: + """Build a ScoreMultiModalParam containing only text.""" + return ScoreMultiModalParam( + content=[ChatCompletionContentPartTextParam(type="text", text=text)], + ) def _run_token_embed_test( @@ -48,6 +97,7 @@ def _run_token_embed_test( dtype=dtype, max_model_len=4096, enforce_eager=True, + gpu_memory_utilization=GPU_MEMORY_UTILIZATION, ) as vllm_model: outputs = vllm_model.token_embed([TEXT_QUERIES[0]]) @@ -83,6 +133,7 @@ def _run_late_interaction_test( dtype=dtype, max_model_len=4096, enforce_eager=True, + gpu_memory_utilization=GPU_MEMORY_UTILIZATION, ) as vllm_model: q_outputs = vllm_model.token_embed([TEXT_QUERIES[0]]) d_outputs = vllm_model.token_embed([TEXT_DOCUMENTS[0]]) @@ -118,6 +169,7 @@ def _run_relevance_test( dtype=dtype, max_model_len=4096, enforce_eager=True, + gpu_memory_utilization=GPU_MEMORY_UTILIZATION, ) as vllm_model: scores = vllm_model.score(query, documents) @@ -154,3 +206,142 @@ def test_colqwen3_relevance_ordering( dtype: str, ) -> None: _run_relevance_test(vllm_runner, model, dtype=dtype) + + +# ── Multimodal scoring tests ──────────────────────────────── + + +def _run_multimodal_text_query_image_docs_test( + vllm_runner: type[VllmRunner], + model: str, + *, + dtype: str, +) -> None: + """Score a text query against image documents via the multimodal path. + + Verifies that score_data_to_prompts correctly handles image content + and produces valid MaxSim scores. + """ + red_image = _make_base64_image(64, 64, color=(255, 0, 0)) + blue_image = _make_base64_image(64, 64, color=(0, 0, 255)) + + query = "Describe the red object" + image_docs = [ + _make_image_mm_param(red_image), + _make_image_mm_param(blue_image), + ] + + with vllm_runner( + model, + runner="pooling", + dtype=dtype, + max_model_len=4096, + enforce_eager=True, + gpu_memory_utilization=GPU_MEMORY_UTILIZATION, + ) as vllm_model: + scores = vllm_model.llm.score(query, image_docs) + + assert len(scores) == 2 + for s in scores: + assert isinstance(s.outputs.score, float) + + +def _run_multimodal_mixed_docs_test( + vllm_runner: type[VllmRunner], + model: str, + *, + dtype: str, +) -> None: + """Score a text query against a mix of text and image documents. + + Ensures the late-interaction path handles heterogeneous document + types (plain strings alongside ScoreMultiModalParam images) in + a single call. + """ + red_image = _make_base64_image(64, 64, color=(255, 0, 0)) + + query = "What is the capital of France?" + documents: list = [ + "The capital of France is Paris.", + _make_image_mm_param(red_image), + ] + + with vllm_runner( + model, + runner="pooling", + dtype=dtype, + max_model_len=4096, + enforce_eager=True, + gpu_memory_utilization=GPU_MEMORY_UTILIZATION, + ) as vllm_model: + scores = vllm_model.llm.score(query, documents) + + assert len(scores) == 2 + for s in scores: + assert isinstance(s.outputs.score, float) + # Text document about France should score higher than a random image + assert scores[0].outputs.score > scores[1].outputs.score + + +def _run_multimodal_image_query_text_docs_test( + vllm_runner: type[VllmRunner], + model: str, + *, + dtype: str, +) -> None: + """Score an image query against text documents. + + Verifies the reverse direction: multimodal query with text-only + documents through the late-interaction scoring path. + """ + red_image = _make_base64_image(64, 64, color=(255, 0, 0)) + image_query = _make_image_mm_param(red_image, text="red color") + + documents = [ + "A bright red sports car.", + "The weather forecast shows rain tomorrow.", + ] + + with vllm_runner( + model, + runner="pooling", + dtype=dtype, + max_model_len=4096, + enforce_eager=True, + gpu_memory_utilization=GPU_MEMORY_UTILIZATION, + ) as vllm_model: + scores = vllm_model.llm.score(image_query, documents) + + assert len(scores) == 2 + for s in scores: + assert isinstance(s.outputs.score, float) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [DTYPE]) +def test_colqwen3_multimodal_text_query_image_docs( + vllm_runner, + model: str, + dtype: str, +) -> None: + _run_multimodal_text_query_image_docs_test(vllm_runner, model, dtype=dtype) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [DTYPE]) +def test_colqwen3_multimodal_mixed_docs( + vllm_runner, + model: str, + dtype: str, +) -> None: + _run_multimodal_mixed_docs_test(vllm_runner, model, dtype=dtype) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [DTYPE]) +def test_colqwen3_multimodal_image_query_text_docs( + vllm_runner, + model: str, + dtype: str, +) -> None: + _run_multimodal_image_query_text_docs_test(vllm_runner, model, dtype=dtype) diff --git a/tests/models/registry.py b/tests/models/registry.py index de8d33e55..b37dfb6d8 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -603,6 +603,9 @@ _EMBEDDING_EXAMPLE_MODELS = { "OpsColQwen3Model": _HfExamplesInfo( "OpenSearch-AI/Ops-Colqwen3-4B", trust_remote_code=True ), + "Qwen3VLNemotronEmbedModel": _HfExamplesInfo( + "nvidia/nemotron-colembed-vl-4b-v2", + ), "SiglipModel": _HfExamplesInfo("google/siglip-base-patch16-224"), "PrithviGeoSpatialMAE": _HfExamplesInfo( "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f1b32c750..deff23df4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -50,6 +50,7 @@ from vllm.entrypoints.pooling.score.utils import ( compress_token_type_ids, compute_maxsim_score, get_score_prompt, + score_data_to_prompts, validate_score_input, ) from vllm.entrypoints.utils import log_non_default_args @@ -1395,25 +1396,13 @@ class LLM: tokenizer = self.get_tokenizer() - # Extract text from ScoreData - text_1: list[str] = [] - for text in data_1: - if not isinstance(text, str): - raise NotImplementedError( - "Late interaction scores currently do not support multimodal input." - ) - text_1.append(text) + # Convert ScoreData to PromptType (handles both text and multimodal) + model_config = self.model_config + prompts_1 = score_data_to_prompts(data_1, "query", model_config) + prompts_2 = score_data_to_prompts(data_2, "document", model_config) - text_2: list[str] = [] - for text in data_2: - if not isinstance(text, str): - raise NotImplementedError( - "Late interaction scores currently do not support multimodal input." - ) - text_2.append(text) - - encoded_output = self.encode( - text_1 + text_2, + encoded_output: list[PoolingRequestOutput] = self.encode( + prompts_1 + prompts_2, use_tqdm=use_tqdm, lora_request=lora_request, pooling_params=pooling_params, @@ -1421,8 +1410,8 @@ class LLM: tokenization_kwargs=tokenization_kwargs, ) - encoded_output_1 = encoded_output[0 : len(text_1)] - encoded_output_2 = encoded_output[len(text_1) :] + encoded_output_1: list[PoolingRequestOutput] = encoded_output[: len(prompts_1)] + encoded_output_2: list[PoolingRequestOutput] = encoded_output[len(prompts_1) :] if len(encoded_output_1) == 1: encoded_output_1 = encoded_output_1 * len(encoded_output_2) diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index fe01f9cf6..135853d6f 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -33,6 +33,7 @@ from vllm.entrypoints.pooling.score.utils import ( compress_token_type_ids, compute_maxsim_score, get_score_prompt, + parse_score_data_single, validate_score_input, ) from vllm.inputs.data import ProcessorInputs, TokensPrompt, token_inputs @@ -174,6 +175,43 @@ class ServingScores(OpenAIServing): return final_res_batch + def _preprocess_late_interaction_item( + self, + data: ScoreData, + role: str, + request: RerankRequest | ScoreRequest, + tokenizer: TokenizerLike, + tokenization_kwargs: dict[str, Any], + ) -> tuple[str, TokensPrompt]: + """Parse a single ScoreData into a text + optional multimodal + TokensPrompt for late-interaction encoding. + + For plain strings, tokenises directly. + For multimodal content parts, extracts text and multi_modal_data. + """ + model_config = self.model_config + + if isinstance(data, str): + text, mm_data, mm_uuids = data, None, None + else: + text, mm_data, mm_uuids = parse_score_data_single(data, role, model_config) + + prompt_inputs = tokenizer(text, **tokenization_kwargs) + self._validate_input(request, prompt_inputs["input_ids"], text) + + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + ) + + if mm_data is not None: + engine_prompt["multi_modal_data"] = mm_data + if mm_uuids is not None: + engine_prompt["multi_modal_uuids"] = mm_uuids + if request.mm_processor_kwargs is not None: + engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs + + return text, engine_prompt + async def _late_interaction_score( self, data_1: list[ScoreData], @@ -189,37 +227,36 @@ class ServingScores(OpenAIServing): Encodes queries and documents into per-token embeddings, then computes MaxSim: sum over query tokens of max similarity to any document token. """ - input_texts: list[str] = [] - for text in data_1 + data_2: - if not isinstance(text, str): - raise NotImplementedError( - "Late interaction scores currently do not support multimodal input." - ) - input_texts.append(text) - model_config = self.model_config tokenizer = self.renderer.get_tokenizer() + tokenization_kwargs = request.build_tok_params(model_config).get_encode_kwargs() - encode_async = make_async( - tokenizer.encode, + all_data = data_1 + data_2 + roles = ["query"] * len(data_1) + ["document"] * len(data_2) + + preprocess_async = make_async( + self._preprocess_late_interaction_item, executor=self._tokenizer_executor, ) - tokenization_kwargs = request.build_tok_params(model_config).get_encode_kwargs() - tokenized_prompts = await asyncio.gather( - *(encode_async(t, **tokenization_kwargs) for t in input_texts) + preprocessed = await asyncio.gather( + *( + preprocess_async( + data=d, + role=r, + request=request, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + ) + for d, r in zip(all_data, roles) + ) ) - engine_prompts: list[ProcessorInputs] = [] - for tok_result, input_text in zip(tokenized_prompts, input_texts): - text_token_prompt = self._validate_input(request, tok_result, input_text) - - engine_prompts.append( - token_inputs( - text_token_prompt["prompt_token_ids"], - prompt=input_text, - ) - ) + input_texts: list[str] = [] + engine_prompts: list[TokensPrompt] = [] + for text, engine_prompt in preprocessed: + input_texts.append(text) + engine_prompts.append(engine_prompt) # Schedule the request and get the result generator. generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] diff --git a/vllm/entrypoints/pooling/score/utils.py b/vllm/entrypoints/pooling/score/utils.py index 7d00f42f5..60e71ff73 100644 --- a/vllm/entrypoints/pooling/score/utils.py +++ b/vllm/entrypoints/pooling/score/utils.py @@ -21,6 +21,7 @@ from vllm.entrypoints.chat_utils import ( _parse_chat_message_content_parts, ) from vllm.inputs import TokensPrompt +from vllm.inputs.data import PromptType, TextPrompt from vllm.model_executor.models.interfaces import supports_score_template from vllm.multimodal.inputs import MultiModalDataDict, MultiModalUUIDDict from vllm.outputs import PoolingRequestOutput @@ -153,31 +154,91 @@ def validate_score_input( return score_input_1, score_input_2 +def _ensure_str(content: list[ConversationMessage]) -> str: + """Extract a single string prompt from parsed conversation content.""" + assert len(content) == 1 + prompt = content[0]["content"] + if prompt is not None and isinstance(prompt, str): + return cast(str, prompt) + raise ValueError(f"Only string content is supported, but got {content}.") + + def parse_score_data( data_1: ScoreData, data_2: ScoreData, model_config: ModelConfig, ) -> tuple[str, str, MultiModalDataDict | None, MultiModalUUIDDict | None]: + """Parse a query-document pair into text prompts and shared multi-modal + data. + + Uses a **single** :class:`MultiModalItemTracker` so that multi-modal + items from both inputs are merged into one ``mm_data`` dict. This is + the correct behaviour for cross-encoder scoring, where query and + document are concatenated into a single model prompt. + """ mm_tracker = MultiModalItemTracker(model_config) content_1 = _parse_score_content("query", data_1, mm_tracker) content_2 = _parse_score_content("document", data_2, mm_tracker) - def ensure_str(content: list[ConversationMessage]) -> str: - assert len(content) == 1 - prompt = content[0]["content"] - if prompt is not None and isinstance(prompt, str): - return cast(str, prompt) - else: - raise ValueError(f"Only string content is supported, but got {content}.") - - prompt_1 = ensure_str(content_1) - prompt_2 = ensure_str(content_2) + prompt_1 = _ensure_str(content_1) + prompt_2 = _ensure_str(content_2) mm_items, mm_uuids = mm_tracker.resolve_items() return prompt_1, prompt_2, mm_items, mm_uuids +def parse_score_data_single( + data: ScoreData, + role: str, + model_config: ModelConfig, +) -> tuple[str, MultiModalDataDict | None, MultiModalUUIDDict | None]: + """Parse **one** ScoreData into a text prompt and its own multi-modal + data. + + Unlike :func:`parse_score_data`, each call creates an **independent** + :class:`MultiModalItemTracker` so multi-modal items are kept separate. + This is the correct behaviour for late-interaction scoring, where + query and document are encoded independently. + """ + mm_tracker = MultiModalItemTracker(model_config) + content = _parse_score_content(role, data, mm_tracker) + + prompt = _ensure_str(content) + mm_items, mm_uuids = mm_tracker.resolve_items() + return prompt, mm_items, mm_uuids + + +def score_data_to_prompts( + data_list: list[ScoreData], + role: str, + model_config: ModelConfig, +) -> list[PromptType]: + """Convert a list of ScoreData into PromptType objects. + + For plain text inputs, returns the string directly. + For multimodal inputs (list of content parts), parses them into + a :class:`TextPrompt` with attached ``multi_modal_data`` / + ``multi_modal_uuids``. + + This is used by late-interaction scoring where each query/document + is encoded independently. + """ + prompts: list[PromptType] = [] + for data in data_list: + if isinstance(data, str): + prompts.append(data) + else: + text, mm_data, mm_uuids = parse_score_data_single(data, role, model_config) + prompt: TextPrompt = TextPrompt(prompt=text) + if mm_data is not None: + prompt["multi_modal_data"] = mm_data + if mm_uuids is not None: + prompt["multi_modal_uuids"] = mm_uuids + prompts.append(prompt) + return prompts + + def _parse_score_content( role: str, data: ScoreData, diff --git a/vllm/model_executor/models/colqwen3.py b/vllm/model_executor/models/colqwen3.py index f60d93f8e..7513c01e8 100644 --- a/vllm/model_executor/models/colqwen3.py +++ b/vllm/model_executor/models/colqwen3.py @@ -16,6 +16,7 @@ Based on: Qwen3-VL backbone with custom text projection Target models: - TomoroAI/tomoro-colqwen3-embed-8b - OpenSearch-AI/Ops-Colqwen3-4B +- nvidia/nemotron-colembed-vl-4b-v2 """ from collections.abc import Iterable, Mapping @@ -229,13 +230,14 @@ class ColQwen3Model( if not isinstance(hidden_states, torch.Tensor): return hidden_states # type: ignore - proj_dtype = self.custom_text_proj.weight.dtype # type: ignore - if hidden_states.dtype != proj_dtype: - hidden_states = hidden_states.to(proj_dtype) + if self.custom_text_proj is not None: + proj_dtype = self.custom_text_proj.weight.dtype + if hidden_states.dtype != proj_dtype: + hidden_states = hidden_states.to(proj_dtype) + hidden_states = self.custom_text_proj(hidden_states) - # Project to embedding dimension and L2 normalize - proj = self.custom_text_proj(hidden_states) # type: ignore - return torch.nn.functional.normalize(proj, p=2, dim=-1) + # L2 normalize + return torch.nn.functional.normalize(hidden_states, p=2, dim=-1) # Names used for the projection layer across different ColQwen3 variants _PROJ_LAYER_NAMES = { diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index ca9468a19..598df91d9 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -256,6 +256,7 @@ _EMBEDDING_MODELS = { "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 "ColQwen3": ("colqwen3", "ColQwen3Model"), "OpsColQwen3Model": ("colqwen3", "ColQwen3Model"), + "Qwen3VLNemotronEmbedModel": ("colqwen3", "ColQwen3Model"), "SiglipModel": ("siglip", "SiglipEmbeddingModel"), # Technically Terratorch models work on images, both in # input and output. I am adding it here because it piggy-backs on embedding diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index ece5614fc..852e1d2a3 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -76,6 +76,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( chatglm="ChatGLMConfig", colqwen3="ColQwen3Config", ops_colqwen3="OpsColQwen3Config", + qwen3_vl_nemotron_embed="Qwen3VLNemotronEmbedConfig", deepseek_vl_v2="DeepseekVLV2Config", deepseek_v32="DeepseekV3Config", flex_olmo="FlexOlmoConfig",