diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index 1f17fca69..d7f13f4e3 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -374,6 +374,77 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
 
 An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py)
 
+### ColQwen3 Multi-Modal Late Interaction Models
+
+ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone.
+
+| Architecture | Backbone | Example HF Models |
+|---|---|---|
+| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` |
+| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` |
+
+Start the server:
+
+```shell
+vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
+```
+
+Then you can use the rerank endpoint:
+
+```shell
+curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
+  "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+  "query": "What is machine learning?",
+  "documents": [
+    "Machine learning is a subset of artificial intelligence.",
+    "Python is a programming language.",
+    "Deep learning uses neural networks."
+  ]
+}'
+```
+
+Or the score endpoint:
+
+```shell
+curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
+  "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+  "text_1": "What is the capital of France?",
+  "text_2": ["The capital of France is Paris.", "Python is a programming language."]
+}'
+```
+
+You can also get the raw token embeddings using the pooling endpoint with the `token_embed` task:
+
+```shell
+curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
+  "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+  "input": "What is machine learning?",
+  "task": "token_embed"
+}'
+```
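+
+Each item's `data` field holds a `[num_tokens, embed_dim]` matrix of L2-normalized token vectors. As a minimal client-side sketch (illustrative, assuming NumPy), ColBERT-style MaxSim takes the best-matching document token for each query token and sums over the query tokens:
+
+```python
+import numpy as np
+
+def maxsim(q_emb: np.ndarray, d_emb: np.ndarray) -> float:
+    # q_emb: [num_query_tokens, dim], d_emb: [num_doc_tokens, dim]
+    return float((q_emb @ d_emb.T).max(axis=-1).sum())
+```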
+
+For **image inputs**, use the chat-style `messages` field so that the vLLM multimodal processor handles them correctly:
+
+```shell
+curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
+  "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+  "messages": [
+    {
+      "role": "user",
+      "content": [
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,"}},
+        {"type": "text", "text": "Describe the image."}
+      ]
+    }
+  ]
+}'
+```
+
+Examples can be found here:
+
+- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
+- Reranking: [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
+
 ### BAAI/bge-m3
 
 The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
diff --git a/examples/pooling/score/colqwen3_rerank_online.py b/examples/pooling/score/colqwen3_rerank_online.py
new file mode 100644
index 000000000..ba1df150b
--- /dev/null
+++ b/examples/pooling/score/colqwen3_rerank_online.py
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Example of using the ColQwen3 late interaction model for reranking.
+
+ColQwen3 is a multi-modal ColBERT-style model based on Qwen3-VL.
+It produces per-token embeddings and uses MaxSim scoring for retrieval
+and reranking. It supports both text and image inputs; this script
+exercises the text-only /rerank and /score endpoints.
+
+Start the server with:
+    vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 50000
+
+Then run this script:
+    python colqwen3_rerank_online.py
+"""
+
+import requests
+
+MODEL = "TomoroAI/tomoro-colqwen3-embed-4b"
+BASE_URL = "http://127.0.0.1:8000"
+
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+
+def rerank_text():
+    """Text-only reranking via /rerank endpoint."""
+    print("=" * 60)
+    print("1. Text reranking (/rerank)")
+    print("=" * 60)
+
+    data = {
+        "model": MODEL,
+        "query": "What is machine learning?",
+        "documents": [
+            "Machine learning is a subset of artificial intelligence.",
+            "Python is a programming language.",
+            "Deep learning uses neural networks for complex tasks.",
+            "The weather today is sunny.",
+        ],
+    }
+
+    response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data)
+
+    if response.status_code == 200:
+        result = response.json()
+        print("\n Ranked documents (most relevant first):")
+        for item in result["results"]:
+            doc_idx = item["index"]
+            score = item["relevance_score"]
+            print(f" [{score:.4f}] {data['documents'][doc_idx]}")
+    else:
+        print(f" Request failed: {response.status_code}")
+        print(f" {response.text[:300]}")
+
+
+def score_text():
+    """Text-only scoring via /score endpoint."""
+    print()
+    print("=" * 60)
+    print("2. Text scoring (/score)")
+    print("=" * 60)
+
+    query = "What is the capital of France?"
+    documents = [
+        "The capital of France is Paris.",
+        "Berlin is the capital of Germany.",
+        "Python is a programming language.",
+    ]
+
+    data = {
+        "model": MODEL,
+        "text_1": query,
+        "text_2": documents,
+    }
+
+    response = requests.post(f"{BASE_URL}/score", headers=headers, json=data)
+
+    if response.status_code == 200:
+        result = response.json()
+        print(f"\n Query: {query}\n")
+        for item in result["data"]:
+            idx = item["index"]
+            score = item["score"]
+            print(f" Doc {idx} (score={score:.4f}): {documents[idx]}")
+    else:
+        print(f" Request failed: {response.status_code}")
+        print(f" {response.text[:300]}")
+
+
+def score_text_top_n():
+    """Text reranking with top_n filtering via /rerank endpoint."""
+    print()
+    print("=" * 60)
+    print("3. Text reranking with top_n=2 (/rerank)")
+    print("=" * 60)
+
+    data = {
+        "model": MODEL,
+        "query": "What is the capital of France?",
+        "documents": [
+            "The capital of France is Paris.",
+            "Berlin is the capital of Germany.",
+            "Python is a programming language.",
+            "The Eiffel Tower is in Paris.",
+        ],
+        "top_n": 2,
+    }
+
+    response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data)
+
+    if response.status_code == 200:
+        result = response.json()
+        print(f"\n Top {data['top_n']} results:")
+        for item in result["results"]:
+            doc_idx = item["index"]
+            score = item["relevance_score"]
+            print(f" [{score:.4f}] {data['documents'][doc_idx]}")
+    else:
+        print(f" Request failed: {response.status_code}")
+        print(f" {response.text[:300]}")
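+
+
+# Optional cross-check (illustrative, not part of the original example flow):
+# for this model the /score endpoint computes ColBERT-style MaxSim over the
+# per-token embeddings, so the same number can be recomputed client-side from
+# the /pooling endpoint. Assumes numpy is installed; call this from main()
+# if you want to run the check.
+def verify_maxsim():
+    """Recompute a /score result client-side from token embeddings."""
+    import numpy as np
+
+    query = "What is the capital of France?"
+    document = "The capital of France is Paris."
+
+    def token_embed(text: str) -> np.ndarray:
+        data = {"model": MODEL, "input": text, "task": "token_embed"}
+        resp = requests.post(f"{BASE_URL}/pooling", headers=headers, json=data)
+        resp.raise_for_status()
+        # Each response item's "data" field is a [num_tokens, dim] matrix.
+        return np.array(resp.json()["data"][0]["data"])
+
+    q_emb, d_emb = token_embed(query), token_embed(document)
+    # MaxSim: best-matching document token per query token, summed.
+    maxsim = float((q_emb @ d_emb.T).max(axis=-1).sum())
+
+    data = {"model": MODEL, "text_1": query, "text_2": [document]}
+    resp = requests.post(f"{BASE_URL}/score", headers=headers, json=data)
+    server_score = resp.json()["data"][0]["score"]
+    print(f"\n Client-side MaxSim: {maxsim:.4f} | /score: {server_score:.4f}")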
+
+
+def main():
+    rerank_text()
+    score_text()
+    score_text_top_n()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pooling/token_embed/colqwen3_token_embed_online.py b/examples/pooling/token_embed/colqwen3_token_embed_online.py
new file mode 100644
index 000000000..20445742f
--- /dev/null
+++ b/examples/pooling/token_embed/colqwen3_token_embed_online.py
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+"""
+Example online usage of the Pooling API for ColQwen3 multi-vector retrieval.
+
+ColQwen3 is a multi-modal late interaction model based on Qwen3-VL that
+produces per-token, L2-normalized embeddings (320-dimensional for the
+default 4B checkpoint) for both text and image inputs. Similarity is
+computed via MaxSim scoring.
+
+This example mirrors the official TomoroAI inference code
+(https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-4b) but uses the
+vLLM serving API instead of local HuggingFace model loading.
+
+Start the server with:
+    vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
+
+Then run this script:
+    python colqwen3_token_embed_online.py
+"""
+
+import argparse
+import base64
+from io import BytesIO
+
+import numpy as np
+import requests
+from PIL import Image
+
+# ── Helpers ─────────────────────────────────────────────────
+
+
+def post_http_request(payload: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    return requests.post(api_url, headers=headers, json=payload)
+
+
+def load_image(url: str) -> Image.Image:
+    """Download an image from URL (handles Wikimedia 403)."""
+    for hdrs in ({}, {"User-Agent": "Mozilla/5.0 (compatible; ColQwen3-demo/1.0)"}):
+        resp = requests.get(url, headers=hdrs, timeout=10)
+        if resp.status_code == 403:
+            continue
+        resp.raise_for_status()
+        return Image.open(BytesIO(resp.content)).convert("RGB")
+    raise RuntimeError(f"Could not fetch image from {url}")
+
+
+def encode_image_base64(image: Image.Image) -> str:
+    """Encode a PIL image to a base64 data URI."""
+    buf = BytesIO()
+    image.save(buf, format="PNG")
+    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
+
+
+def compute_maxsim(q_emb: np.ndarray, d_emb: np.ndarray) -> float:
+    """Compute the ColBERT-style MaxSim score between query and document."""
+    sim = q_emb @ d_emb.T
+    return float(sim.max(axis=-1).sum())
+
+
+# ── Encode functions ────────────────────────────────────────
+
+
+def encode_queries(texts: list[str], model: str, api_url: str) -> list[np.ndarray]:
+    """Encode text queries → list of multi-vector embeddings."""
+    resp = post_http_request(
+        {"model": model, "input": texts, "task": "token_embed"}, api_url
+    )
+    return [np.array(item["data"]) for item in resp.json()["data"]]
+
+
+def encode_images(
+    image_urls: list[str], model: str, api_url: str
+) -> list[np.ndarray]:
+    """Encode image documents → list of multi-vector embeddings.
+
+    Images are sent via the chat-style `messages` field so that the
+    vLLM multimodal processor handles them correctly.
+    """
+    embeddings = []
+    for url in image_urls:
+        print(f" Loading: {url.split('/')[-1]}...")
+        image = load_image(url)
+        image_uri = encode_image_base64(image)
+        resp = post_http_request(
+            {
+                "model": model,
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "image_url", "image_url": {"url": image_uri}},
+                            {"type": "text", "text": "Describe the image."},
+                        ],
+                    }
+                ],
+            },
+            api_url,
+        )
+        result = resp.json()
+        if resp.status_code != 200 or "data" not in result:
+            print(f" Error ({resp.status_code}): {str(result)[:200]}")
+            continue
+        embeddings.append(np.array(result["data"][0]["data"]))
+    return embeddings
+
+
+# ── Main ────────────────────────────────────────────────────
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="TomoroAI/tomoro-colqwen3-embed-4b",
+    )
+    return parser.parse_args()
+
+
+def main(args):
+    pooling_url = f"http://{args.host}:{args.port}/pooling"
+    score_url = f"http://{args.host}:{args.port}/score"
+    model = args.model
+
+    # Same sample data as the official TomoroAI example
+    queries = [
+        "Retrieve the city of Singapore",
+        "Retrieve the city of Beijing",
+        "Retrieve the city of London",
+    ]
+    image_urls = [
+        "https://upload.wikimedia.org/wikipedia/commons/2/27/Singapore_skyline_2022.jpg",
+        "https://upload.wikimedia.org/wikipedia/commons/6/61/Beijing_skyline_at_night.JPG",
+        "https://upload.wikimedia.org/wikipedia/commons/4/49/London_skyline.jpg",
+    ]
+
+    # ── 1) Text query embeddings ────────────────────────────
+    print("=" * 60)
+    print("1. Encode text queries (multi-vector)")
+    print("=" * 60)
+    query_embeddings = encode_queries(queries, model, pooling_url)
+    for i, emb in enumerate(query_embeddings):
+        norm = float(np.linalg.norm(emb[0]))
+        print(f' Query {i}: {emb.shape} (L2 norm: {norm:.4f}) "{queries[i]}"')
+
+    # ── 2) Image document embeddings ────────────────────────
+    print()
+    print("=" * 60)
+    print("2. Encode image documents (multi-vector)")
+    print("=" * 60)
+    doc_embeddings = encode_images(image_urls, model, pooling_url)
+    for i, emb in enumerate(doc_embeddings):
+        print(f" Doc {i}: {emb.shape} {image_urls[i].split('/')[-1]}")
+
+    # ── 3) Cross-modal MaxSim scoring ───────────────────────
+    if doc_embeddings:
+        print()
+        print("=" * 60)
+        print("3. Cross-modal MaxSim scores (text queries × image docs)")
+        print("=" * 60)
+        # Header row (34 chars wide to match the 1 + 33 label column below)
+        print(f"{'':>34s}", end="")
+        for j in range(len(doc_embeddings)):
+            print(f" Doc {j:>2d}", end="")
+        print()
+        # Score matrix
+        for i, q_emb in enumerate(query_embeddings):
+            print(f" {queries[i]:<33s}", end="")
+            for j, d_emb in enumerate(doc_embeddings):
+                score = compute_maxsim(q_emb, d_emb)
+                print(f" {score:6.2f}", end="")
+            print()
+
+    # ── 4) Text-only /score endpoint ────────────────────────
+    print()
+    print("=" * 60)
+    print("4. Text-only late interaction scoring (/score endpoint)")
+    print("=" * 60)
+    text_query = "What is the capital of France?"
+    text_docs = [
+        "The capital of France is Paris.",
+        "Berlin is the capital of Germany.",
+        "Python is a programming language.",
+    ]
+    resp = post_http_request(
+        {"model": model, "text_1": text_query, "text_2": text_docs},
+        score_url,
+    )
+    print(f' Query: "{text_query}"\n')
+    for item in resp.json()["data"]:
+        idx = item["index"]
+        print(f" Doc {idx} (score={item['score']:.4f}): {text_docs[idx]}")
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/tests/models/multimodal/pooling/test_colqwen3.py b/tests/models/multimodal/pooling/test_colqwen3.py
new file mode 100644
index 000000000..51080cc10
--- /dev/null
+++ b/tests/models/multimodal/pooling/test_colqwen3.py
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for the ColQwen3 late interaction model for multi-modal retrieval.
+
+ColQwen3 is a multi-vector retrieval model based on a Qwen3-VL backbone with
+ColBERT-style late interaction scoring (MaxSim). It produces per-token
+embeddings for both text and image inputs.
+"""
+
+import pytest
+import torch
+
+from ....conftest import VllmRunner
+
+MODELS = [
+    "TomoroAI/tomoro-colqwen3-embed-4b",
+    "OpenSearch-AI/Ops-Colqwen3-4B",
+]
+
+EMBED_DIMS = {
+    "TomoroAI/tomoro-colqwen3-embed-4b": 320,
+    "OpenSearch-AI/Ops-Colqwen3-4B": 2560,
+}
+
+TEXT_QUERIES = [
+    "What is the capital of France?",
+    "Describe the contents of the document.",
+]
+
+TEXT_DOCUMENTS = [
+    "The capital of France is Paris.",
+    "This document contains important financial data.",
+]
+
+DTYPE = "half"
+
+
+def _run_token_embed_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Verify per-token embedding shape and L2 normalization."""
+    with vllm_runner(
+        model,
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    ) as vllm_model:
+        outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
+
+        assert len(outputs) == 1
+        emb = torch.tensor(outputs[0])
+        # Token embeddings should be 2D: [num_tokens, embed_dim]
+        assert emb.dim() == 2
+        assert emb.shape[1] == EMBED_DIMS[model]
+        assert emb.shape[0] > 1
+
+        # Verify L2 normalization
+        norms = torch.norm(emb, p=2, dim=-1)
+        torch.testing.assert_close(
+            norms,
+            torch.ones_like(norms),
+            rtol=1e-2,
+            atol=1e-2,
+        )
+
+
+def _run_late_interaction_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Verify MaxSim scoring matches manual computation."""
+    from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
+
+    with vllm_runner(
+        model,
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    ) as vllm_model:
+        q_outputs = vllm_model.token_embed([TEXT_QUERIES[0]])
+        d_outputs = vllm_model.token_embed([TEXT_DOCUMENTS[0]])
+
+        q_emb = torch.tensor(q_outputs[0])
+        d_emb = torch.tensor(d_outputs[0])
+
+        manual_score = compute_maxsim_score(q_emb, d_emb).item()
+
+        vllm_scores = vllm_model.score(TEXT_QUERIES[0], TEXT_DOCUMENTS[0])
+
+        assert len(vllm_scores) == 1
+        assert vllm_scores[0] == pytest.approx(manual_score, rel=0.01)
+
+
+def _run_relevance_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Verify that relevant documents score higher than irrelevant ones."""
+    query = "What is machine learning?"
+    documents = [
+        "Machine learning is a subset of artificial intelligence.",
+        "The weather forecast shows rain tomorrow.",
+        "Deep learning uses neural networks for complex tasks.",
+    ]
+
+    with vllm_runner(
+        model,
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    ) as vllm_model:
+        scores = vllm_model.score(query, documents)
+
+        assert len(scores) == 3
+        assert scores[0] > scores[1], "ML doc should score higher than weather doc"
+        assert scores[2] > scores[1], "DL doc should score higher than weather doc"
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_token_embed(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    _run_token_embed_test(vllm_runner, model, dtype=dtype)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_late_interaction_scoring(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    _run_late_interaction_test(vllm_runner, model, dtype=dtype)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_relevance_ordering(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    _run_relevance_test(vllm_runner, model, dtype=dtype)
diff --git a/tests/models/registry.py b/tests/models/registry.py
index fb05c5803..16d33bb5b 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -597,6 +597,12 @@ _EMBEDDING_EXAMPLE_MODELS = {
         "TIGER-Lab/VLM2Vec-Full", trust_remote_code=True
     ),
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"),
+    "ColQwen3": _HfExamplesInfo(
+        "TomoroAI/tomoro-colqwen3-embed-4b", trust_remote_code=True
+    ),
+    "OpsColQwen3Model": _HfExamplesInfo(
+        "OpenSearch-AI/Ops-Colqwen3-4B", trust_remote_code=True
+    ),
     "SiglipModel": _HfExamplesInfo("google/siglip-base-patch16-224"),
     "PrithviGeoSpatialMAE": _HfExamplesInfo(
         "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
diff --git a/vllm/model_executor/models/colqwen3.py b/vllm/model_executor/models/colqwen3.py
new file mode 100644
index 000000000..f60d93f8e
--- /dev/null
+++ b/vllm/model_executor/models/colqwen3.py
@@ -0,0 +1,306 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+ColQwen3 late interaction model for multi-modal retrieval and reranking.
+
+ColQwen3 extends Qwen3-VL with a ColBERT-style late interaction head,
+producing per-token embeddings for both text and image inputs. It uses
+MaxSim scoring for retrieval/reranking tasks.
+
+This model supports the "token_embed" pooling task and is designed for
+multi-vector retrieval of documents containing both text and images.
+
+Reference: https://arxiv.org/abs/2407.01449 (ColPali)
+Based on: Qwen3-VL backbone with custom text projection
+
+Target models:
+- TomoroAI/tomoro-colqwen3-embed-8b
+- OpenSearch-AI/Ops-Colqwen3-4B
+"""
+
+from collections.abc import Iterable, Mapping
+from typing import ClassVar, Literal
+
+import torch
+import torch.nn as nn
+from transformers.models.qwen3_vl import Qwen3VLProcessor
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from .interfaces_base import default_pooling_type
+from .qwen2_vl import Qwen2VLMultiModalDataParser
+from .qwen3_vl import (
+    Qwen3VLDummyInputsBuilder,
+    Qwen3VLForConditionalGeneration,
+    Qwen3VLMultiModalProcessor,
+    Qwen3VLProcessingInfo,
+)
+from .utils import AutoWeightsLoader, WeightsMapper
+
+
+class ColQwen3ProcessingInfo(Qwen3VLProcessingInfo):
+    """Processing info for ColQwen3 models.
+
+    ColQwen3 models (TomoroAI, OpenSearch-AI, etc.) use custom HuggingFace
+    configs (e.g. ColQwen3Config, OpsColQwen3Config) that are not instances
+    of Qwen3VLConfig. We override get_hf_config() and get_hf_processor()
+    to skip the strict type check, similar to OpenCUAProcessingInfo.
+    """
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs: object) -> Qwen3VLProcessor:
+        # Force the standard Qwen3VLProcessor even when trust_remote_code=True.
+        # ColQwen3 custom processors (e.g. ColQwen3Processor) have
+        # incompatible interfaces with vLLM's Qwen3VLMultiModalProcessor.
+        # The standard Qwen3VLProcessor handles both text and image inputs
+        # correctly for the Qwen3-VL backbone.
+        return self.ctx.get_hf_processor(
+            Qwen3VLProcessor,
+            use_fast=kwargs.pop("use_fast", True),
+            **kwargs,
+        )
+
+    @property
+    def _supports_video(self) -> bool:
+        """Check if the HF processor supports video inputs."""
+        return hasattr(self.get_hf_processor(), "video_processor")
+
+    def get_video_processor(self, **kwargs: object):
+        if not self._supports_video:
+            raise AttributeError(
+                f"The processor for {self.ctx.model_config.model} does not "
+                "support video inputs (no video_processor attribute)."
+            )
+        return self.get_hf_processor(**kwargs).video_processor  # type: ignore[attr-defined]
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        limits: dict[str, int | None] = {"image": None}
+        if self._supports_video:
+            limits["video"] = None
+        return limits
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        max_image_tokens = self.get_max_image_tokens()
+        result: dict[str, int] = {"image": max_image_tokens}
+        if self._supports_video:
+            max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+            result["video"] = max_video_tokens
+        return result
+
+    def get_data_parser(self):
+        hf_config = self.get_hf_config()
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        return Qwen2VLMultiModalDataParser(
+            spatial_merge_size,
+            video_needs_metadata=self._supports_video,
+            expected_hidden_size=self._get_expected_hidden_size(),
+        )
+
+
+ ) + return self.get_hf_processor(**kwargs).video_processor # type: ignore[attr-defined] + + def get_supported_mm_limits(self) -> Mapping[str, int | None]: + limits: dict[str, int | None] = {"image": None} + if self._supports_video: + limits["video"] = None + return limits + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + max_image_tokens = self.get_max_image_tokens() + result: dict[str, int] = {"image": max_image_tokens} + if self._supports_video: + max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts) + result["video"] = max_video_tokens + return result + + def get_data_parser(self): + hf_config = self.get_hf_config() + spatial_merge_size = hf_config.vision_config.spatial_merge_size + return Qwen2VLMultiModalDataParser( + spatial_merge_size, + video_needs_metadata=self._supports_video, + expected_hidden_size=self._get_expected_hidden_size(), + ) + + +@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL") +@MULTIMODAL_REGISTRY.register_processor( + Qwen3VLMultiModalProcessor, + info=ColQwen3ProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder, +) +class ColQwen3Model( + Qwen3VLForConditionalGeneration, +): + """ColQwen3 late interaction model for multi-modal retrieval/reranking. + + This model extends Qwen3VLForConditionalGeneration with a ColBERT-style + linear projection layer for per-token embeddings. It supports: + - "token_embed" task: Per-token embeddings for late interaction scoring + + The model produces L2-normalized per-token embeddings by: + 1. Running the Qwen3-VL backbone (vision + language) to get hidden states + 2. Projecting hidden states through a linear layer (hidden_size -> embed_dim) + 3. L2-normalizing the projected embeddings + + ColBERT-style MaxSim scoring is computed externally, either client-side + or via the late interaction scoring path in ServingScores. + + Attributes: + custom_text_proj: Linear projection from hidden_size to embed_dim + supports_late_interaction: Flag indicating this model uses late + interaction scoring + """ + + # Mark this as a pooling model so vLLM routes to pooler path + is_pooling_model = True + + # Mark this model as supporting late interaction scoring + supports_late_interaction: ClassVar[Literal[True]] = True + + # Override hf_to_vllm_mapper to handle ColQwen3 weight naming. + # NOTE: WeightsMapper applies ALL matching prefix rules sequentially + # (no early exit), so more-specific prefixes must come first. + # TomoroAI: "vlm.model.visual.", "vlm.model.language_model." + # ColPali: "model.visual.", "model.language_model." + # OpenSearch: "visual.", "language_model." (no outer prefix, + # re-prefixed to "model.*" in load_weights) + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # TomoroAI naming convention (most specific first) + "vlm.model.visual.": "visual.", + "vlm.lm_head.": "language_model.lm_head.", + "vlm.model.language_model.": "language_model.model.", + # ColPali / nvidia naming convention + "model.visual.": "visual.", + "lm_head.": "language_model.lm_head.", + # OpenSearch-AI: after re-prefix, "language_model.model.*" + # becomes "model.language_model.model.*" — handle this before + # the shorter "model.language_model." 
rule to avoid double map + "model.language_model.model.": "language_model.model.", + "model.language_model.": "language_model.model.", + } + ) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + head_dtype = vllm_config.model_config.head_dtype + + hidden_size = getattr(config, "hidden_size", None) + if hidden_size is None and hasattr(config, "text_config"): + hidden_size = config.text_config.hidden_size + if hidden_size is None: + raise ValueError( + "Unable to determine text hidden size from config. " + "Expected 'hidden_size' or 'text_config.hidden_size'." + ) + self._proj_hidden_size = hidden_size + + # (TomoroAI: embed_dim, OpenSearch: dims, ColPali: dim) + self.embed_dim: int | None = ( + getattr(config, "embed_dim", None) + or getattr(config, "dims", None) + or getattr(config, "dim", None) + or getattr(config, "projection_dim", None) + or getattr(config, "colbert_dim", None) + ) + + # Build the projection layer if embed_dim is known + if self.embed_dim is not None: + self.custom_text_proj = nn.Linear( + hidden_size, + self.embed_dim, + bias=False, + dtype=head_dtype, + ) + else: + # Will be created during load_weights when dim is inferred + self.custom_text_proj = None + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + self.pooler = pooler_for_token_embed( + pooler_config, + projector=None, + ) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors=None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ) -> torch.Tensor: + """Run forward pass producing per-token embeddings.""" + hidden_states = super().forward( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) + + if not isinstance(hidden_states, torch.Tensor): + return hidden_states # type: ignore + + proj_dtype = self.custom_text_proj.weight.dtype # type: ignore + if hidden_states.dtype != proj_dtype: + hidden_states = hidden_states.to(proj_dtype) + + # Project to embedding dimension and L2 normalize + proj = self.custom_text_proj(hidden_states) # type: ignore + return torch.nn.functional.normalize(proj, p=2, dim=-1) + + # Names used for the projection layer across different ColQwen3 variants + _PROJ_LAYER_NAMES = { + "custom_text_proj", # ColPali naming + "embedding_proj_layer", # TomoroAI naming + } + + def _is_proj_weight(self, name: str) -> bool: + """Check if a weight name belongs to the projection layer.""" + return any(proj_name in name for proj_name in self._PROJ_LAYER_NAMES) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights with special handling for ColQwen3 projection layer.""" + weights_list = list(weights) + proj_weights: list[tuple[str, torch.Tensor]] = [] + model_weights: list[tuple[str, torch.Tensor]] = [] + + # Scan all weight names to determine if re-prefixing is needed. + # OpenSearch-AI models have unprefixed weights ("language_model.*", + # "visual.*") that need "model." added so hf_to_vllm_mapper can + # process them. Only re-prefix if ALL backbone weights are + # unprefixed (no "vlm." or "model." prefix found). 
+        has_unprefixed = any(
+            name.startswith("language_model.") or name.startswith("visual.")
+            for name, _ in weights_list
+        )
+        has_prefixed = any(
+            name.startswith("vlm.") or name.startswith("model.")
+            for name, _ in weights_list
+        )
+        needs_reprefix = has_unprefixed and not has_prefixed
+
+        for name, weight in weights_list:
+            if self._is_proj_weight(name):
+                proj_weights.append((name, weight))
+            else:
+                if needs_reprefix:
+                    name = "model." + name
+                model_weights.append((name, weight))
+
+        loader = AutoWeightsLoader(self)
+        loaded = loader.load_weights(model_weights, mapper=self.hf_to_vllm_mapper)
+
+        if proj_weights:
+            model_dtype = next(self.language_model.parameters()).dtype
+            model_device = next(self.language_model.parameters()).device
+
+            for name, weight in proj_weights:
+                if self.embed_dim is None and "weight" in name:
+                    # Infer the projection dim from the checkpoint and build
+                    # the layer lazily (runs at most once per load).
+                    self.embed_dim = weight.shape[0]
+                    has_bias = any("bias" in n for n, _ in proj_weights)
+                    self.custom_text_proj = nn.Linear(
+                        self._proj_hidden_size,
+                        self.embed_dim,
+                        bias=has_bias,
+                        dtype=model_dtype,
+                    )
+                    self.custom_text_proj.to(model_device)
+
+                if self.custom_text_proj is not None:
+                    param_name = name.split(".")[-1]
+                    param = getattr(self.custom_text_proj, param_name, None)
+                    if param is not None:
+                        weight = weight.to(device=param.device, dtype=param.dtype)
+                        default_weight_loader(param, weight)
+                        loaded.add(f"custom_text_proj.{param_name}")
+
+        return loaded
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 2ae22ea63..7e8d051a8 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -254,6 +254,8 @@ _EMBEDDING_MODELS = {
     ),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    "ColQwen3": ("colqwen3", "ColQwen3Model"),
+    "OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
     "SiglipModel": ("siglip", "SiglipEmbeddingModel"),
     # Technically Terratorch models work on images, both in
     # input and output. I am adding it here because it piggy-backs on embedding
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index b930eec06..ece5614fc 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -74,6 +74,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     afmoe="AfmoeConfig",
     bagel="BagelConfig",
     chatglm="ChatGLMConfig",
+    colqwen3="ColQwen3Config",
+    ops_colqwen3="OpsColQwen3Config",
+    qwen3_vl_nemotron_embed="Qwen3VLNemotronEmbedConfig",
     deepseek_vl_v2="DeepseekVLV2Config",
     deepseek_v32="DeepseekV3Config",
     flex_olmo="FlexOlmoConfig",
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 0fcadf826..d02ab01d7 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -18,6 +18,9 @@ _CLASS_TO_MODULE: dict[str, str] = {
     "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
     "BagelConfig": "vllm.transformers_utils.configs.bagel",
     "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
+    "ColQwen3Config": "vllm.transformers_utils.configs.colqwen3",
+    "OpsColQwen3Config": "vllm.transformers_utils.configs.colqwen3",
+    "Qwen3VLNemotronEmbedConfig": "vllm.transformers_utils.configs.colqwen3",
     "DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
     "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
     "EAGLEConfig": "vllm.transformers_utils.configs.eagle",
@@ -68,6 +71,9 @@ __all__ = [
     "AfmoeConfig",
     "BagelConfig",
     "ChatGLMConfig",
+    "ColQwen3Config",
+    "OpsColQwen3Config",
+    "Qwen3VLNemotronEmbedConfig",
     "DeepseekVLV2Config",
     "DeepseekV3Config",
     "DotsOCRConfig",
diff --git a/vllm/transformers_utils/configs/colqwen3.py b/vllm/transformers_utils/configs/colqwen3.py
new file mode 100644
index 000000000..1c09a0a91
--- /dev/null
+++ b/vllm/transformers_utils/configs/colqwen3.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+ColQwen3 configuration that extends Qwen3VLConfig with embedding projection
+fields. This allows ColQwen3 models to be loaded without trust_remote_code
+by mapping their custom model_type (colqwen3, ops_colqwen3, etc.) to a
+standard config class that vLLM understands.
+
+Supported model_types:
+- colqwen3 (TomoroAI/tomoro-colqwen3-embed-8b)
+- ops_colqwen3 (OpenSearch-AI/Ops-Colqwen3-4B)
+- qwen3_vl_nemotron_embed (nvidia/nemotron-colembed-vl-8b-v2)
+"""
+
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
+
+
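+# As an illustration, a checkpoint resolved to this class might carry an
+# abridged config.json like the following (field values are illustrative):
+#
+#   {
+#     "model_type": "colqwen3",
+#     "embed_dim": 320,
+#     "text_config": {...},
+#     "vision_config": {...}
+#   }
+#
+# vLLM maps "model_type" to this class via _CONFIG_REGISTRY in
+# vllm/transformers_utils/config.py.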
+ """ + + # Accept any ColQwen3 variant model_type + model_type = "colqwen3" + + def __init__( + self, + embed_dim: int | None = None, + dims: int | None = None, + dim: int | None = None, + projection_dim: int | None = None, + colbert_dim: int | None = None, + pooling: str | None = None, + **kwargs, + ): + # Store embedding projection config fields + self.embed_dim = embed_dim + self.dims = dims + self.dim = dim + self.projection_dim = projection_dim + self.colbert_dim = colbert_dim + self.pooling = pooling + + super().__init__(**kwargs) + + +class OpsColQwen3Config(ColQwen3Config): + """Configuration for OpenSearch-AI ColQwen3 variants.""" + + model_type = "ops_colqwen3" + + +class Qwen3VLNemotronEmbedConfig(ColQwen3Config): + """Configuration for NVIDIA Nemotron ColEmbed variants.""" + + model_type = "qwen3_vl_nemotron_embed"