[new model] add COLQwen3 code & Inference (#34398)

Signed-off-by: craftsangjae <craftsangjae@gmail.com> Signed-off-by: katacoder <craftsangjae@gmail.com>
2026-02-14 13:15:19 +09:00
parent de42abb366
commit d1ea65d0a1
10 changed files with 935 additions and 0 deletions
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -374,6 +374,77 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{

 An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py)

+### ColQwen3 Multi-Modal Late Interaction Models
+
+ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone.
+
+| Architecture | Backbone | Example HF Models |
+|---|---|---|
+| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` |
+| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` |
+
+Start the server:
+
+```shell
+vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
+```
+
+Then you can use the rerank endpoint:
+
+```shell
+curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "query": "What is machine learning?",
+    "documents": [
+        "Machine learning is a subset of artificial intelligence.",
+        "Python is a programming language.",
+        "Deep learning uses neural networks."
+    ]
+}'
+```
+
+Or the score endpoint:
+
+```shell
+curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "text_1": "What is the capital of France?",
+    "text_2": ["The capital of France is Paris.", "Python is a programming language."]
+}'
+```
+
+You can also get the raw token embeddings using the pooling endpoint with the `token_embed` task:
+
+```shell
+curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "input": "What is machine learning?",
+    "task": "token_embed"
+}'
+```
+
+For **image inputs**, use the chat-style `messages` field so that the vLLM multimodal processor handles them correctly:
+
+```shell
+curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
+    "model": "TomoroAI/tomoro-colqwen3-embed-4b",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
+                {"type": "text", "text": "Describe the image."}
+            ]
+        }
+    ]
+}'
+```
+
+Examples can be found here:
+
+- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
+- Reranking: [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
+
 ### BAAI/bge-m3

 The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
--- a/examples/pooling/score/colqwen3_rerank_online.py
+++ b/examples/pooling/score/colqwen3_rerank_online.py
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Example of using ColQwen3 late interaction model for reranking.
+
+ColQwen3 is a multi-modal ColBERT-style model based on Qwen3-VL.
+It produces per-token embeddings and uses MaxSim scoring for retrieval
+and reranking. Supports both text and image inputs.
+
+Start the server with:
+    vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 50000
+
+Then run this script:
+    python colqwen3_rerank_online.py
+"""
+
+import requests
+
+# Model name sent in the "model" field of every request in this script.
+MODEL = "TomoroAI/tomoro-colqwen3-embed-4b"
+# Address of the server started with `vllm serve` (see the module docstring).
+BASE_URL = "http://127.0.0.1:8000"
+
+# HTTP headers shared by all requests below (JSON request and response).
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+
+def rerank_text():
+    """Rerank a handful of text documents against one query via /rerank.
+
+    Prints each returned document with its relevance score in the order the
+    server returned them, or the HTTP status and the first 300 characters of
+    the body when the request does not return 200.
+    """
+    banner = "=" * 60
+    print(banner)
+    print("1. Text reranking (/rerank)")
+    print(banner)
+
+    documents = [
+        "Machine learning is a subset of artificial intelligence.",
+        "Python is a programming language.",
+        "Deep learning uses neural networks for complex tasks.",
+        "The weather today is sunny.",
+    ]
+    payload = {
+        "model": MODEL,
+        "query": "What is machine learning?",
+        "documents": documents,
+    }
+
+    resp = requests.post(f"{BASE_URL}/rerank", headers=headers, json=payload)
+
+    # Guard clause: report the failure and stop.
+    if resp.status_code != 200:
+        print(f"  Request failed: {resp.status_code}")
+        print(f"  {resp.text[:300]}")
+        return
+
+    body = resp.json()
+    print("\n  Ranked documents (most relevant first):")
+    for entry in body["results"]:
+        position = entry["index"]
+        relevance = entry["relevance_score"]
+        print(f"    [{relevance:.4f}] {documents[position]}")
+
+
+def score_text():
+    """Score one query against several text documents via /score.
+
+    Prints one line per returned item (document index, score, document text),
+    or the HTTP status and the first 300 characters of the body when the
+    request does not return 200.
+    """
+    banner = "=" * 60
+    print()
+    print(banner)
+    print("2. Text scoring (/score)")
+    print(banner)
+
+    query = "What is the capital of France?"
+    documents = [
+        "The capital of France is Paris.",
+        "Berlin is the capital of Germany.",
+        "Python is a programming language.",
+    ]
+    payload = {"model": MODEL, "text_1": query, "text_2": documents}
+
+    resp = requests.post(f"{BASE_URL}/score", headers=headers, json=payload)
+
+    # Guard clause: report the failure and stop.
+    if resp.status_code != 200:
+        print(f"  Request failed: {resp.status_code}")
+        print(f"  {resp.text[:300]}")
+        return
+
+    body = resp.json()
+    print(f"\n  Query: {query}\n")
+    for entry in body["data"]:
+        position = entry["index"]
+        value = entry["score"]
+        print(f"    Doc {position} (score={value:.4f}): {documents[position]}")
+
+
+def score_text_top_n():
+    """Rerank text documents via /rerank, asking for only the top 2 results.
+
+    Despite its name, this demo uses the /rerank endpoint with the `top_n`
+    request field. Prints the returned documents with their relevance scores,
+    or the HTTP status and the first 300 characters of the body on failure.
+    """
+    banner = "=" * 60
+    print()
+    print(banner)
+    print("3. Text reranking with top_n=2 (/rerank)")
+    print(banner)
+
+    top_n = 2
+    documents = [
+        "The capital of France is Paris.",
+        "Berlin is the capital of Germany.",
+        "Python is a programming language.",
+        "The Eiffel Tower is in Paris.",
+    ]
+    payload = {
+        "model": MODEL,
+        "query": "What is the capital of France?",
+        "documents": documents,
+        "top_n": top_n,
+    }
+
+    resp = requests.post(f"{BASE_URL}/rerank", headers=headers, json=payload)
+
+    # Guard clause: report the failure and stop.
+    if resp.status_code != 200:
+        print(f"  Request failed: {resp.status_code}")
+        print(f"  {resp.text[:300]}")
+        return
+
+    body = resp.json()
+    print(f"\n  Top {top_n} results:")
+    for entry in body["results"]:
+        position = entry["index"]
+        relevance = entry["relevance_score"]
+        print(f"    [{relevance:.4f}] {documents[position]}")
+
+
+def main():
+    """Run the three demos in order: rerank, score, rerank with top_n."""
+    for demo in (rerank_text, score_text, score_text_top_n):
+        demo()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/pooling/token_embed/colqwen3_token_embed_online.py
+++ b/examples/pooling/token_embed/colqwen3_token_embed_online.py
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+
+"""
+Example online usage of Pooling API for ColQwen3 multi-vector retrieval.
+
+ColQwen3 is a multi-modal late interaction model based on Qwen3-VL that
+produces per-token embeddings (320-dim, L2-normalized) for both text and
+image inputs. Similarity is computed via MaxSim scoring.
+
+This example mirrors the official TomoroAI inference code
+(https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-4b) but uses the
+vLLM serving API instead of local HuggingFace model loading.
+
+Start the server with:
+    vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
+
+Then run this script:
+    python colqwen3_token_embed_online.py
+"""
+
+import argparse
+import base64
+from io import BytesIO
+
+import numpy as np
+import requests
+from PIL import Image
+
+# ── Helpers ─────────────────────────────────────────────────
+
+
+def post_http_request(payload: dict, api_url: str) -> requests.Response:
+    """POST `payload` as JSON to `api_url` and return the raw response.
+
+    The response status is not checked here; callers inspect it themselves.
+    """
+    return requests.post(
+        api_url,
+        json=payload,
+        headers={"User-Agent": "Test Client"},
+    )
+
+
+def load_image(url: str) -> Image.Image:
+    """Fetch `url` and return it as an RGB PIL image.
+
+    The download is tried first without extra headers and, if the server
+    answers 403, once more with a browser-like User-Agent.
+
+    Raises:
+        requests.HTTPError: for any non-403 error status.
+        RuntimeError: when both attempts were answered with 403.
+    """
+    header_attempts = (
+        {},
+        {"User-Agent": "Mozilla/5.0 (compatible; ColQwen3-demo/1.0)"},
+    )
+    for attempt_headers in header_attempts:
+        response = requests.get(url, headers=attempt_headers, timeout=10)
+        if response.status_code != 403:
+            response.raise_for_status()
+            raw = BytesIO(response.content)
+            return Image.open(raw).convert("RGB")
+    raise RuntimeError(f"Could not fetch image from {url}")
+
+
+def encode_image_base64(image: Image.Image) -> str:
+    """Serialize a PIL image as PNG and wrap it in a base64 `data:` URI."""
+    with BytesIO() as png_buffer:
+        image.save(png_buffer, format="PNG")
+        encoded = base64.b64encode(png_buffer.getvalue())
+    return f"data:image/png;base64,{encoded.decode()}"
+
+
+def compute_maxsim(q_emb: np.ndarray, d_emb: np.ndarray) -> float:
+    """Return the ColBERT-style MaxSim score of a query against a document.
+
+    For every row of `q_emb` the largest dot product with any row of `d_emb`
+    is taken, and those per-row maxima are summed.
+    """
+    similarity = np.matmul(q_emb, d_emb.T)
+    best_match_per_query_row = similarity.max(axis=-1)
+    return float(best_match_per_query_row.sum())
+
+
+# ── Encode functions ────────────────────────────────────────
+
+
+def encode_queries(texts: list[str], model: str, api_url: str) -> list[np.ndarray]:
+    """Encode text queries → list of multi-vector embeddings.
+
+    Args:
+        texts: Query strings, sent together in one request as `input`.
+        model: Model name placed in the request's `model` field.
+        api_url: URL the request is POSTed to.
+
+    Returns:
+        One array per item in the response's `data` list.
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status.
+    """
+    resp = post_http_request({"model": model, "input": texts}, api_url)
+    # Surface the HTTP error itself rather than an opaque KeyError on "data".
+    resp.raise_for_status()
+    return [np.array(item["data"]) for item in resp.json()["data"]]
+
+
+def encode_images(image_urls: list[str], model: str, api_url: str) -> list[np.ndarray]:
+    """Encode image documents → list of multi-vector embeddings.
+
+    Each image is downloaded, re-encoded as a base64 PNG data URI and sent in
+    its own request through the chat-style `messages` field so that the
+    vLLM multimodal processor handles it correctly.
+
+    An image whose request fails (non-200 status or no `data` in the reply)
+    is reported and skipped, so the returned list can be shorter than
+    `image_urls` and its positions then no longer match the input order.
+    """
+    collected = []
+    for source_url in image_urls:
+        print(f"  Loading: {source_url.split('/')[-1]}...")
+        data_uri = encode_image_base64(load_image(source_url))
+
+        user_message = {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": data_uri}},
+                {"type": "text", "text": "Describe the image."},
+            ],
+        }
+        reply = post_http_request(
+            {"model": model, "messages": [user_message]},
+            api_url,
+        )
+
+        body = reply.json()
+        succeeded = reply.status_code == 200 and "data" in body
+        if succeeded:
+            collected.append(np.array(body["data"][0]["data"]))
+        else:
+            print(f"    Error ({reply.status_code}): {str(body)[:200]}")
+    return collected
+
+
+# ── Main ────────────────────────────────────────────────────
+
+
+def parse_args():
+    """Parse the command line: --host, --port and --model, all optional."""
+    options = (
+        ("--host", str, "localhost"),
+        ("--port", int, 8000),
+        ("--model", str, "TomoroAI/tomoro-colqwen3-embed-4b"),
+    )
+    parser = argparse.ArgumentParser()
+    for flag, value_type, default_value in options:
+        parser.add_argument(flag, type=value_type, default=default_value)
+    return parser.parse_args()
+
+
+def main(args):
+    """Run the ColQwen3 multi-vector retrieval demo against a vLLM server.
+
+    Steps: embed text queries via /pooling, embed image documents via
+    /pooling, print the client-side MaxSim score matrix, then score a
+    text-only query/document set through /score.
+
+    Args:
+        args: Parsed CLI namespace providing `host`, `port` and `model`.
+    """
+    pooling_url = f"http://{args.host}:{args.port}/pooling"
+    score_url = f"http://{args.host}:{args.port}/score"
+    model = args.model
+
+    # Same sample data as the official TomoroAI example
+    queries = [
+        "Retrieve the city of Singapore",
+        "Retrieve the city of Beijing",
+        "Retrieve the city of London",
+    ]
+    image_urls = [
+        "https://upload.wikimedia.org/wikipedia/commons/2/27/Singapore_skyline_2022.jpg",
+        "https://upload.wikimedia.org/wikipedia/commons/6/61/Beijing_skyline_at_night.JPG",
+        "https://upload.wikimedia.org/wikipedia/commons/4/49/London_skyline.jpg",
+    ]
+
+    # ── 1) Text query embeddings ────────────────────────────
+    print("=" * 60)
+    print("1. Encode text queries (multi-vector)")
+    print("=" * 60)
+    query_embeddings = encode_queries(queries, model, pooling_url)
+    for i, emb in enumerate(query_embeddings):
+        # Norm of the first token vector only, as a quick sanity check.
+        norm = float(np.linalg.norm(emb[0]))
+        print(f'  Query {i}: {emb.shape}  (L2 norm: {norm:.4f})  "{queries[i]}"')
+
+    # ── 2) Image document embeddings ────────────────────────
+    print()
+    print("=" * 60)
+    print("2. Encode image documents (multi-vector)")
+    print("=" * 60)
+    doc_embeddings = encode_images(image_urls, model, pooling_url)
+    # encode_images() skips images whose request failed, so positions in
+    # doc_embeddings only line up with image_urls when nothing was skipped.
+    # In that case do not print a file name that may belong to another image.
+    labels_aligned = len(doc_embeddings) == len(image_urls)
+    for i, emb in enumerate(doc_embeddings):
+        if labels_aligned:
+            label = image_urls[i].split("/")[-1]
+        else:
+            label = "(source image unknown: some images failed)"
+        print(f"  Doc {i}:   {emb.shape}  {label}")
+
+    # ── 3) Cross-modal MaxSim scoring ───────────────────────
+    if doc_embeddings:
+        print()
+        print("=" * 60)
+        print("3. Cross-modal MaxSim scores (text queries × image docs)")
+        print("=" * 60)
+        # Header
+        print(f"{'':>35s}", end="")
+        for j in range(len(doc_embeddings)):
+            print(f"  Doc {j:>2d}", end="")
+        print()
+        # Score matrix
+        for i, q_emb in enumerate(query_embeddings):
+            print(f"  {queries[i]:<33s}", end="")
+            for d_emb in doc_embeddings:
+                score = compute_maxsim(q_emb, d_emb)
+                print(f"  {score:6.2f}", end="")
+            print()
+
+    # ── 4) Text-only /score endpoint ────────────────────────
+    print()
+    print("=" * 60)
+    print("4. Text-only late interaction scoring (/score endpoint)")
+    print("=" * 60)
+    text_query = "What is the capital of France?"
+    text_docs = [
+        "The capital of France is Paris.",
+        "Berlin is the capital of Germany.",
+        "Python is a programming language.",
+    ]
+    resp = post_http_request(
+        {"model": model, "text_1": text_query, "text_2": text_docs},
+        score_url,
+    )
+    # Report a failed request instead of crashing on a missing "data" key.
+    if resp.status_code != 200:
+        print(f"  Request failed: {resp.status_code}")
+        print(f"  {resp.text[:300]}")
+        return
+    print(f'  Query: "{text_query}"\n')
+    for item in resp.json()["data"]:
+        idx = item["index"]
+        print(f"  Doc {idx} (score={item['score']:.4f}): {text_docs[idx]}")
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/tests/models/multimodal/pooling/test_colqwen3.py
+++ b/tests/models/multimodal/pooling/test_colqwen3.py
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for ColQwen3 late interaction model for multi-modal retrieval.
+
+ColQwen3 is a multi-vector retrieval model based on Qwen3-VL backbone with
+ColBERT-style late interaction scoring (MaxSim). It produces per-token
+embeddings for both text and image inputs.
+"""
+
+import pytest
+import torch
+
+from ....conftest import VllmRunner
+
+# Checkpoints every test in this module is parametrized over.
+MODELS = [
+    "TomoroAI/tomoro-colqwen3-embed-4b",
+    "OpenSearch-AI/Ops-Colqwen3-4B",
+]
+
+# Expected size of the last dimension of the token embeddings, per model
+# (compared against emb.shape[1] in the token-embed test).
+EMBED_DIMS = {
+    "TomoroAI/tomoro-colqwen3-embed-4b": 320,
+    "OpenSearch-AI/Ops-Colqwen3-4B": 2560,
+}
+
+# Sample query texts; the tests below only use the first entry.
+TEXT_QUERIES = [
+    "What is the capital of France?",
+    "Describe the contents of the document.",
+]
+
+# Sample document texts; the tests below only use the first entry.
+TEXT_DOCUMENTS = [
+    "The capital of France is Paris.",
+    "This document contains important financial data.",
+]
+
+# dtype string passed to the vLLM runner in every test.
+DTYPE = "half"
+
+
+def _run_token_embed_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Check that token_embed returns a unit-norm 2D embedding matrix.
+
+    Embeds the first sample query and asserts that the result is
+    [num_tokens, EMBED_DIMS[model]] with more than one token and that each
+    row has an L2 norm of 1 within a 1e-2 tolerance.
+    """
+    runner_kwargs = dict(
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    )
+    with vllm_runner(model, **runner_kwargs) as llm:
+        results = llm.token_embed([TEXT_QUERIES[0]])
+        assert len(results) == 1
+
+        token_embs = torch.tensor(results[0])
+        # One row per token, one column per embedding dimension.
+        assert token_embs.dim() == 2
+        assert token_embs.shape[1] == EMBED_DIMS[model]
+        assert token_embs.shape[0] > 1
+
+        # Every row must be (approximately) L2-normalized.
+        row_norms = token_embs.norm(p=2, dim=-1)
+        expected_norms = torch.ones_like(row_norms)
+        torch.testing.assert_close(
+            row_norms,
+            expected_norms,
+            rtol=1e-2,
+            atol=1e-2,
+        )
+
+
+def _run_late_interaction_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Check that score() agrees with MaxSim computed from token embeddings.
+
+    The first sample query and document are embedded separately, scored with
+    compute_maxsim_score, and the result is compared with the value returned
+    by the runner's score() within a 1% relative tolerance.
+    """
+    from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
+
+    query_text = TEXT_QUERIES[0]
+    doc_text = TEXT_DOCUMENTS[0]
+    runner_kwargs = dict(
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    )
+
+    with vllm_runner(model, **runner_kwargs) as llm:
+        query_embs = torch.tensor(llm.token_embed([query_text])[0])
+        doc_embs = torch.tensor(llm.token_embed([doc_text])[0])
+
+        expected = compute_maxsim_score(query_embs, doc_embs).item()
+
+        served = llm.score(query_text, doc_text)
+
+        assert len(served) == 1
+        assert served[0] == pytest.approx(expected, rel=0.01)
+
+
+def _run_relevance_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Check that on-topic documents outscore an off-topic one.
+
+    Scores a machine-learning query against two related documents and one
+    unrelated weather document, and asserts both related documents score
+    strictly higher than the weather document.
+    """
+    query = "What is machine learning?"
+    ml_doc = "Machine learning is a subset of artificial intelligence."
+    weather_doc = "The weather forecast shows rain tomorrow."
+    dl_doc = "Deep learning uses neural networks for complex tasks."
+    documents = [ml_doc, weather_doc, dl_doc]
+
+    runner_kwargs = dict(
+        runner="pooling",
+        dtype=dtype,
+        max_model_len=4096,
+        enforce_eager=True,
+    )
+    with vllm_runner(model, **runner_kwargs) as llm:
+        scores = llm.score(query, documents)
+
+        assert len(scores) == 3
+        ml_score, weather_score, dl_score = scores[0], scores[1], scores[2]
+        assert ml_score > weather_score, (
+            "ML doc should score higher than weather doc"
+        )
+        assert dl_score > weather_score, (
+            "DL doc should score higher than weather doc"
+        )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_token_embed(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Token embedding shape and normalization, for each model and dtype."""
+    _run_token_embed_test(vllm_runner, model=model, dtype=dtype)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_late_interaction_scoring(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """score() versus manual MaxSim, for each model and dtype."""
+    _run_late_interaction_test(vllm_runner, model=model, dtype=dtype)
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", [DTYPE])
+def test_colqwen3_relevance_ordering(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Relevant documents outscore an irrelevant one, per model and dtype."""
+    _run_relevance_test(vllm_runner, model=model, dtype=dtype)
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -597,6 +597,12 @@ _EMBEDDING_EXAMPLE_MODELS = {
        "TIGER-Lab/VLM2Vec-Full", trust_remote_code=True
    ),
    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"),
+    "ColQwen3": _HfExamplesInfo(
+        "TomoroAI/tomoro-colqwen3-embed-4b", trust_remote_code=True
+    ),
+    "OpsColQwen3Model": _HfExamplesInfo(
+        "OpenSearch-AI/Ops-Colqwen3-4B", trust_remote_code=True
+    ),
    "SiglipModel": _HfExamplesInfo("google/siglip-base-patch16-224"),
    "PrithviGeoSpatialMAE": _HfExamplesInfo(
        "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
--- a/vllm/model_executor/models/colqwen3.py
+++ b/vllm/model_executor/models/colqwen3.py
@@ -0,0 +1,306 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+ColQwen3 late interaction model for multi-modal retrieval and reranking.
+
+ColQwen3 extends Qwen3-VL with a ColBERT-style late interaction head,
+producing per-token embeddings for both text and image inputs. It uses
+MaxSim scoring for retrieval/reranking tasks.
+
+This model supports the "token_embed" pooling task and is designed for
+multi-vector retrieval of documents containing both text and images.
+
+Reference: https://arxiv.org/abs/2407.01449 (ColPali)
+Based on: Qwen3-VL backbone with custom text projection
+
+Target models:
+- TomoroAI/tomoro-colqwen3-embed-8b
+- OpenSearch-AI/Ops-Colqwen3-4B
+"""
+
+from collections.abc import Iterable, Mapping
+from typing import ClassVar, Literal
+
+import torch
+import torch.nn as nn
+from transformers.models.qwen3_vl import Qwen3VLProcessor
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from .interfaces_base import default_pooling_type
+from .qwen2_vl import Qwen2VLMultiModalDataParser
+from .qwen3_vl import (
+    Qwen3VLDummyInputsBuilder,
+    Qwen3VLForConditionalGeneration,
+    Qwen3VLMultiModalProcessor,
+    Qwen3VLProcessingInfo,
+)
+from .utils import AutoWeightsLoader, WeightsMapper
+
+
+class ColQwen3ProcessingInfo(Qwen3VLProcessingInfo):
+    """Processing info for ColQwen3 checkpoints.
+
+    ColQwen3 checkpoints (TomoroAI, OpenSearch-AI, etc.) ship custom
+    HuggingFace configs (e.g. ColQwen3Config, OpsColQwen3Config) that are not
+    instances of Qwen3VLConfig. get_hf_config() and get_hf_processor() are
+    overridden to skip the strict type check, similar to
+    OpenCUAProcessingInfo. Video support is advertised only when the HF
+    processor exposes a `video_processor` attribute.
+    """
+
+    def get_hf_config(self):
+        """Return the HF config from the context without a type check."""
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs: object) -> Qwen3VLProcessor:
+        """Return the standard Qwen3VLProcessor (`use_fast` defaults to True).
+
+        The standard processor is forced even when trust_remote_code=True:
+        ColQwen3 custom processors (e.g. ColQwen3Processor) have interfaces
+        incompatible with vLLM's Qwen3VLMultiModalProcessor, while the
+        standard Qwen3VLProcessor handles both text and image inputs
+        correctly for the Qwen3-VL backbone.
+        """
+        use_fast = kwargs.pop("use_fast", True)
+        return self.ctx.get_hf_processor(
+            Qwen3VLProcessor,
+            use_fast=use_fast,
+            **kwargs,
+        )
+
+    @property
+    def _supports_video(self) -> bool:
+        """Whether the HF processor exposes a `video_processor` attribute."""
+        processor = self.get_hf_processor()
+        return hasattr(processor, "video_processor")
+
+    def get_video_processor(self, **kwargs: object):
+        """Return the processor's video processor.
+
+        Raises:
+            AttributeError: if the processor has no `video_processor`.
+        """
+        if self._supports_video:
+            return self.get_hf_processor(**kwargs).video_processor  # type: ignore[attr-defined]
+        raise AttributeError(
+            f"The processor for {self.ctx.model_config.model} does not "
+            "support video inputs (no video_processor attribute)."
+        )
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        """Map each supported modality to None; video only if available."""
+        modalities = ("image", "video") if self._supports_video else ("image",)
+        return {modality: None for modality in modalities}
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        """Return the max token count per image, and per video if supported."""
+        per_item: dict[str, int] = {"image": self.get_max_image_tokens()}
+        if not self._supports_video:
+            return per_item
+        per_item["video"] = self.get_max_video_tokens(seq_len, mm_counts)
+        return per_item
+
+    def get_data_parser(self):
+        """Build the Qwen2-VL data parser from the vision config's merge size."""
+        return Qwen2VLMultiModalDataParser(
+            self.get_hf_config().vision_config.spatial_merge_size,
+            video_needs_metadata=self._supports_video,
+            expected_hidden_size=self._get_expected_hidden_size(),
+        )
+
+
+@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen3VLMultiModalProcessor,
+    info=ColQwen3ProcessingInfo,
+    dummy_inputs=Qwen3VLDummyInputsBuilder,
+)
+class ColQwen3Model(
+    Qwen3VLForConditionalGeneration,
+):
+    """ColQwen3 late interaction model for multi-modal retrieval/reranking.
+
+    This model extends Qwen3VLForConditionalGeneration with a ColBERT-style
+    linear projection layer for per-token embeddings. It supports:
+    - "token_embed" task: Per-token embeddings for late interaction scoring
+
+    The model produces L2-normalized per-token embeddings by:
+    1. Running the Qwen3-VL backbone (vision + language) to get hidden states
+    2. Projecting hidden states through a linear layer (hidden_size -> embed_dim)
+    3. L2-normalizing the projected embeddings
+
+    ColBERT-style MaxSim scoring is computed externally, either client-side
+    or via the late interaction scoring path in ServingScores.
+
+    Attributes:
+        custom_text_proj: Linear projection from hidden_size to embed_dim.
+            It is None until load_weights() sees a projection weight when
+            the config declares no embedding dimension.
+        supports_late_interaction: Flag indicating this model uses late
+            interaction scoring
+    """
+
+    # Mark this as a pooling model so vLLM routes to pooler path
+    is_pooling_model = True
+
+    # Mark this model as supporting late interaction scoring
+    supports_late_interaction: ClassVar[Literal[True]] = True
+
+    # Override hf_to_vllm_mapper to handle ColQwen3 weight naming.
+    # NOTE: WeightsMapper applies ALL matching prefix rules sequentially
+    # (no early exit), so more-specific prefixes must come first.
+    #   TomoroAI:    "vlm.model.visual.", "vlm.model.language_model."
+    #   ColPali:     "model.visual.", "model.language_model."
+    #   OpenSearch:  "visual.", "language_model." (no outer prefix,
+    #                re-prefixed to "model.*" in load_weights)
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # TomoroAI naming convention (most specific first)
+            "vlm.model.visual.": "visual.",
+            "vlm.lm_head.": "language_model.lm_head.",
+            "vlm.model.language_model.": "language_model.model.",
+            # ColPali / nvidia naming convention
+            "model.visual.": "visual.",
+            "lm_head.": "language_model.lm_head.",
+            # OpenSearch-AI: after re-prefix, "language_model.model.*"
+            # becomes "model.language_model.model.*" — handle this before
+            # the shorter "model.language_model." rule to avoid double map
+            "model.language_model.model.": "language_model.model.",
+            "model.language_model.": "language_model.model.",
+        }
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
+        """Build the backbone, the projection layer (if sized) and the pooler.
+
+        Raises:
+            ValueError: if neither `hidden_size` nor
+                `text_config.hidden_size` is present on the HF config.
+        """
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+        config = vllm_config.model_config.hf_config
+        head_dtype = vllm_config.model_config.head_dtype
+
+        hidden_size = getattr(config, "hidden_size", None)
+        if hidden_size is None and hasattr(config, "text_config"):
+            hidden_size = config.text_config.hidden_size
+        if hidden_size is None:
+            raise ValueError(
+                "Unable to determine text hidden size from config. "
+                "Expected 'hidden_size' or 'text_config.hidden_size'."
+            )
+        # Kept so load_weights() can build the projection lazily.
+        self._proj_hidden_size = hidden_size
+
+        # The output dimension is stored under different config names
+        # (TomoroAI: embed_dim, OpenSearch: dims, ColPali: dim)
+        self.embed_dim: int | None = (
+            getattr(config, "embed_dim", None)
+            or getattr(config, "dims", None)
+            or getattr(config, "dim", None)
+            or getattr(config, "projection_dim", None)
+            or getattr(config, "colbert_dim", None)
+        )
+
+        # Build the projection layer if embed_dim is known
+        if self.embed_dim is not None:
+            self.custom_text_proj = nn.Linear(
+                hidden_size,
+                self.embed_dim,
+                bias=False,
+                dtype=head_dtype,
+            )
+        else:
+            # Will be created during load_weights when dim is inferred
+            self.custom_text_proj = None
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+        # projector=None: the projection is applied in forward() instead.
+        self.pooler = pooler_for_token_embed(
+            pooler_config,
+            projector=None,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors=None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        """Run forward pass producing per-token embeddings.
+
+        Returns:
+            The backbone output projected by `custom_text_proj` and
+            L2-normalized along the last dimension. If the backbone returns
+            something other than a tensor, it is passed through unchanged.
+
+        Raises:
+            RuntimeError: if the projection layer was never created.
+        """
+        hidden_states = super().forward(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+        if not isinstance(hidden_states, torch.Tensor):
+            return hidden_states  # type: ignore
+
+        # __init__ leaves the projection unset when the config declares no
+        # embedding dimension; it is then only created by load_weights() when
+        # a projection weight is found. Fail with a clear message rather than
+        # an AttributeError on None.
+        if self.custom_text_proj is None:
+            raise RuntimeError(
+                "ColQwen3 projection layer is not initialized: the config "
+                "declares no embedding dimension and no projection weights "
+                "were loaded."
+            )
+
+        # The head may use a different dtype than the backbone output.
+        proj_dtype = self.custom_text_proj.weight.dtype
+        if hidden_states.dtype != proj_dtype:
+            hidden_states = hidden_states.to(proj_dtype)
+
+        # Project to embedding dimension and L2 normalize
+        proj = self.custom_text_proj(hidden_states)
+        return torch.nn.functional.normalize(proj, p=2, dim=-1)
+
+    # Names used for the projection layer across different ColQwen3 variants
+    _PROJ_LAYER_NAMES = {
+        "custom_text_proj",  # ColPali naming
+        "embedding_proj_layer",  # TomoroAI naming
+    }
+
+    def _is_proj_weight(self, name: str) -> bool:
+        """Check if a weight name belongs to the projection layer."""
+        return any(proj_name in name for proj_name in self._PROJ_LAYER_NAMES)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load weights with special handling for ColQwen3 projection layer.
+
+        Backbone weights go through AutoWeightsLoader with
+        `hf_to_vllm_mapper`; projection weights are loaded separately into
+        `custom_text_proj`, which is created here when its size was not
+        known at construction time.
+
+        Returns:
+            The set of loaded parameter names, with projection parameters
+            reported as "custom_text_proj.<param>".
+        """
+        weights_list = list(weights)
+        proj_weights: list[tuple[str, torch.Tensor]] = []
+        model_weights: list[tuple[str, torch.Tensor]] = []
+
+        # Scan all weight names to determine if re-prefixing is needed.
+        # OpenSearch-AI models have unprefixed weights ("language_model.*",
+        # "visual.*") that need "model." added so hf_to_vllm_mapper can
+        # process them. Only re-prefix if ALL backbone weights are
+        # unprefixed (no "vlm." or "model." prefix found).
+        has_unprefixed = any(
+            name.startswith("language_model.") or name.startswith("visual.")
+            for name, _ in weights_list
+        )
+        has_prefixed = any(
+            name.startswith("vlm.") or name.startswith("model.")
+            for name, _ in weights_list
+        )
+        needs_reprefix = has_unprefixed and not has_prefixed
+
+        for name, weight in weights_list:
+            if self._is_proj_weight(name):
+                proj_weights.append((name, weight))
+                continue
+            # Only non-projection weights reach this point.
+            if needs_reprefix:
+                name = "model." + name
+            model_weights.append((name, weight))
+
+        loader = AutoWeightsLoader(self)
+        loaded = loader.load_weights(model_weights, mapper=self.hf_to_vllm_mapper)
+
+        if proj_weights:
+            # A lazily created projection follows the backbone's dtype/device.
+            reference_param = next(self.language_model.parameters())
+            model_dtype = reference_param.dtype
+            model_device = reference_param.device
+
+            for name, weight in proj_weights:
+                if self.embed_dim is None and "weight" in name:
+                    # Infer the output dimension from the checkpoint tensor.
+                    self.embed_dim = weight.shape[0]
+                    has_bias = any("bias" in n for n, _ in proj_weights)
+                    self.custom_text_proj = nn.Linear(
+                        self._proj_hidden_size,
+                        self.embed_dim,
+                        bias=has_bias,
+                        dtype=model_dtype,
+                    )
+                    self.custom_text_proj.to(model_device)
+
+                if self.custom_text_proj is not None:
+                    # Last dotted component of the name, e.g. "weight"/"bias".
+                    param_name = name.split(".")[-1]
+                    param = getattr(self.custom_text_proj, param_name, None)
+                    if param is not None:
+                        weight = weight.to(device=param.device, dtype=param.dtype)
+                        default_weight_loader(param, weight)
+                        loaded.add(f"custom_text_proj.{param_name}")
+
+        return loaded
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -254,6 +254,8 @@ _EMBEDDING_MODELS = {
    ),
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    "ColQwen3": ("colqwen3", "ColQwen3Model"),
+    "OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
    "SiglipModel": ("siglip", "SiglipEmbeddingModel"),
    # Technically Terratorch models work on images, both in
    # input and output. I am adding it here because it piggy-backs on embedding
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -74,6 +74,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
    afmoe="AfmoeConfig",
    bagel="BagelConfig",
    chatglm="ChatGLMConfig",
+    colqwen3="ColQwen3Config",
+    ops_colqwen3="OpsColQwen3Config",
    deepseek_vl_v2="DeepseekVLV2Config",
    deepseek_v32="DeepseekV3Config",
    flex_olmo="FlexOlmoConfig",
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -18,6 +18,9 @@ _CLASS_TO_MODULE: dict[str, str] = {
    "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
    "BagelConfig": "vllm.transformers_utils.configs.bagel",
    "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
+    "ColQwen3Config": "vllm.transformers_utils.configs.colqwen3",
+    "OpsColQwen3Config": "vllm.transformers_utils.configs.colqwen3",
+    "Qwen3VLNemotronEmbedConfig": "vllm.transformers_utils.configs.colqwen3",
    "DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
    "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
    "EAGLEConfig": "vllm.transformers_utils.configs.eagle",
@@ -68,6 +71,9 @@ __all__ = [
    "AfmoeConfig",
    "BagelConfig",
    "ChatGLMConfig",
+    "ColQwen3Config",
+    "OpsColQwen3Config",
+    "Qwen3VLNemotronEmbedConfig",
    "DeepseekVLV2Config",
    "DeepseekV3Config",
    "DotsOCRConfig",
--- a/vllm/transformers_utils/configs/colqwen3.py
+++ b/vllm/transformers_utils/configs/colqwen3.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+ColQwen3 configuration that extends Qwen3VLConfig with embedding projection
+fields. This allows ColQwen3 models to be loaded without trust_remote_code
+by mapping their custom model_type (colqwen3, ops_colqwen3, etc.) to a
+standard config class that vLLM understands.
+
+Supported model_types:
+- colqwen3 (TomoroAI/tomoro-colqwen3-embed-8b)
+- ops_colqwen3 (OpenSearch-AI/Ops-Colqwen3-4B)
+- qwen3_vl_nemotron_embed (nvidia/nemotron-colembed-vl-8b-v2)
+"""
+
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
+
+
+class ColQwen3Config(Qwen3VLConfig):
+    """Qwen3VLConfig extended with ColQwen3 embedding-projection fields.
+
+    `embed_dim`, `dims`, `dim`, `projection_dim`, `colbert_dim` and `pooling`
+    are stored as passed (default None); other kwargs go to Qwen3VLConfig.
+    """
+
+    # Base model_type; the variant subclasses below each override it.
+    model_type = "colqwen3"
+
+    def __init__(
+        self,
+        embed_dim: int | None = None,
+        dims: int | None = None,
+        dim: int | None = None,
+        projection_dim: int | None = None,
+        colbert_dim: int | None = None,
+        pooling: str | None = None,
+        **kwargs,
+    ):
+        # Set the projection fields before delegating the rest to the parent.
+        self.embed_dim = embed_dim
+        self.dims = dims
+        self.dim = dim
+        self.projection_dim = projection_dim
+        self.colbert_dim = colbert_dim
+        self.pooling = pooling
+
+        super().__init__(**kwargs)
+
+
+class OpsColQwen3Config(ColQwen3Config):
+    """OpenSearch-AI variant of ColQwen3Config; overrides only `model_type`."""
+
+    model_type = "ops_colqwen3"
+
+
+class Qwen3VLNemotronEmbedConfig(ColQwen3Config):
+    """NVIDIA Nemotron ColEmbed variant; overrides only `model_type`."""
+
+    model_type = "qwen3_vl_nemotron_embed"