[new model] add COLQwen3 code & Inference (#34398)
Signed-off-by: craftsangjae <craftsangjae@gmail.com> Signed-off-by: katacoder <craftsangjae@gmail.com>
This commit is contained in:
@@ -374,6 +374,77 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
|
||||
An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py)
|
||||
|
||||
### ColQwen3 Multi-Modal Late Interaction Models
|
||||
|
||||
ColQwen3 is based on [ColPali](https://arxiv.org/abs/2407.01449), which extends ColBERT's late interaction approach to **multi-modal** inputs. While ColBERT operates on text-only token embeddings, ColPali/ColQwen3 can embed both **text and images** (e.g. PDF pages, screenshots, diagrams) into per-token L2-normalized vectors and compute relevance via MaxSim scoring. ColQwen3 specifically uses Qwen3-VL as its vision-language backbone.
|
||||
|
||||
| Architecture | Backbone | Example HF Models |
|
||||
|---|---|---|
|
||||
| `ColQwen3` | Qwen3-VL | `TomoroAI/tomoro-colqwen3-embed-4b`, `TomoroAI/tomoro-colqwen3-embed-8b` |
|
||||
| `OpsColQwen3Model` | Qwen3-VL | `OpenSearch-AI/Ops-Colqwen3-4B`, `OpenSearch-AI/Ops-Colqwen3-8B` |
|
||||
|
||||
Start the server:
|
||||
|
||||
```shell
|
||||
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
|
||||
```
|
||||
|
||||
Then you can use the rerank endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Or the score endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"text_1": "What is the capital of France?",
|
||||
"text_2": ["The capital of France is Paris.", "Python is a programming language."]
|
||||
}'
|
||||
```
|
||||
|
||||
You can also get the raw token embeddings using the pooling endpoint with the `token_embed` task:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"input": "What is machine learning?",
|
||||
"task": "token_embed"
|
||||
}'
|
||||
```
|
||||
|
||||
For **image inputs**, use the chat-style `messages` field so that the vLLM multimodal processor handles them correctly:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
|
||||
{"type": "text", "text": "Describe the image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Examples can be found here:
|
||||
|
||||
- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
|
||||
- Reranking: [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
|
||||
|
||||
### BAAI/bge-m3
|
||||
|
||||
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
|
||||
|
||||
130
examples/pooling/score/colqwen3_rerank_online.py
Normal file
130
examples/pooling/score/colqwen3_rerank_online.py
Normal file
@@ -0,0 +1,130 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Example of using ColQwen3 late interaction model for reranking.
|
||||
|
||||
ColQwen3 is a multi-modal ColBERT-style model based on Qwen3-VL.
|
||||
It produces per-token embeddings and uses MaxSim scoring for retrieval
|
||||
and reranking. Supports both text and image inputs.
|
||||
|
||||
Start the server with:
|
||||
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 50000
|
||||
|
||||
Then run this script:
|
||||
python colqwen3_rerank_online.py
|
||||
"""
|
||||
|
||||
import requests
|
||||
|
||||
MODEL = "TomoroAI/tomoro-colqwen3-embed-4b"
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
|
||||
headers = {"accept": "application/json", "Content-Type": "application/json"}
|
||||
|
||||
|
||||
def rerank_text():
    """Rerank a handful of text documents against one query via ``/rerank``.

    Sends the query and candidate documents to the server and prints every
    returned result together with its relevance score, in the order the
    server returned them. If the server does not answer with HTTP 200, the
    status code and the first 300 characters of the body are printed instead.
    """
    rule = "=" * 60
    print(rule)
    print("1. Text reranking (/rerank)")
    print(rule)

    documents = [
        "Machine learning is a subset of artificial intelligence.",
        "Python is a programming language.",
        "Deep learning uses neural networks for complex tasks.",
        "The weather today is sunny.",
    ]
    payload = {
        "model": MODEL,
        "query": "What is machine learning?",
        "documents": documents,
    }

    resp = requests.post(f"{BASE_URL}/rerank", headers=headers, json=payload)

    # Guard clause: report the failure and stop before touching the body.
    if resp.status_code != 200:
        print(f" Request failed: {resp.status_code}")
        print(f" {resp.text[:300]}")
        return

    body = resp.json()
    print("\n Ranked documents (most relevant first):")
    for entry in body["results"]:
        # "index" refers back to the position in the submitted document list.
        position = entry["index"]
        relevance = entry["relevance_score"]
        print(f" [{relevance:.4f}] {documents[position]}")
|
||||
|
||||
|
||||
def score_text():
    """Score one query against several text documents via ``/score``.

    The query is sent as ``text_1`` and the documents as ``text_2``. Each
    returned item is printed with its index, score and the matching document
    text. On a non-200 response the status code and the first 300 characters
    of the body are printed instead.
    """
    rule = "=" * 60
    print()
    print(rule)
    print("2. Text scoring (/score)")
    print(rule)

    query = "What is the capital of France?"
    documents = [
        "The capital of France is Paris.",
        "Berlin is the capital of Germany.",
        "Python is a programming language.",
    ]
    payload = {"model": MODEL, "text_1": query, "text_2": documents}

    resp = requests.post(f"{BASE_URL}/score", headers=headers, json=payload)

    # Guard clause: report the failure and stop before touching the body.
    if resp.status_code != 200:
        print(f" Request failed: {resp.status_code}")
        print(f" {resp.text[:300]}")
        return

    body = resp.json()
    print(f"\n Query: {query}\n")
    for entry in body["data"]:
        # "index" refers back to the position in the submitted document list.
        position = entry["index"]
        value = entry["score"]
        print(f" Doc {position} (score={value:.4f}): {documents[position]}")
|
||||
|
||||
|
||||
def score_text_top_n():
    """Rerank text documents via ``/rerank`` while requesting ``top_n`` = 2.

    Identical in flow to a plain rerank request, except that the payload
    carries a ``top_n`` field. Every result the server returns is printed
    with its relevance score; on a non-200 response the status code and the
    first 300 characters of the body are printed instead.
    """
    rule = "=" * 60
    print()
    print(rule)
    print("3. Text reranking with top_n=2 (/rerank)")
    print(rule)

    top_n = 2
    documents = [
        "The capital of France is Paris.",
        "Berlin is the capital of Germany.",
        "Python is a programming language.",
        "The Eiffel Tower is in Paris.",
    ]
    payload = {
        "model": MODEL,
        "query": "What is the capital of France?",
        "documents": documents,
        "top_n": top_n,
    }

    resp = requests.post(f"{BASE_URL}/rerank", headers=headers, json=payload)

    # Guard clause: report the failure and stop before touching the body.
    if resp.status_code != 200:
        print(f" Request failed: {resp.status_code}")
        print(f" {resp.text[:300]}")
        return

    body = resp.json()
    print(f"\n Top {top_n} results:")
    for entry in body["results"]:
        # "index" refers back to the position in the submitted document list.
        position = entry["index"]
        relevance = entry["relevance_score"]
        print(f" [{relevance:.4f}] {documents[position]}")
|
||||
|
||||
|
||||
def main():
    """Run the three demo requests in order: rerank, score, rerank with top_n."""
    for demo in (rerank_text, score_text, score_text_top_n):
        demo()


if __name__ == "__main__":
    main()
|
||||
198
examples/pooling/token_embed/colqwen3_token_embed_online.py
Normal file
198
examples/pooling/token_embed/colqwen3_token_embed_online.py
Normal file
@@ -0,0 +1,198 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
|
||||
"""
|
||||
Example online usage of Pooling API for ColQwen3 multi-vector retrieval.
|
||||
|
||||
ColQwen3 is a multi-modal late interaction model based on Qwen3-VL that
|
||||
produces per-token embeddings (320-dim, L2-normalized) for both text and
|
||||
image inputs. Similarity is computed via MaxSim scoring.
|
||||
|
||||
This example mirrors the official TomoroAI inference code
|
||||
(https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-4b) but uses the
|
||||
vLLM serving API instead of local HuggingFace model loading.
|
||||
|
||||
Start the server with:
|
||||
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
|
||||
|
||||
Then run this script:
|
||||
python colqwen3_token_embed_online.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
from io import BytesIO
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
# ── Helpers ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def post_http_request(payload: dict, api_url: str) -> requests.Response:
    """POST ``payload`` as JSON to ``api_url`` and return the raw response.

    The request carries a fixed ``User-Agent`` header; no status checking is
    done here, callers inspect the returned response themselves.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=payload,
    )
|
||||
|
||||
|
||||
def load_image(url: str) -> Image.Image:
    """Download an image and return it as an RGB PIL image.

    The URL is first requested without extra headers; if the server answers
    403 (as Wikimedia may), the request is repeated with a browser-like
    ``User-Agent``. Any other error status raises via ``raise_for_status``.

    Raises:
        RuntimeError: if every attempt was answered with HTTP 403.
    """
    header_attempts = (
        {},
        {"User-Agent": "Mozilla/5.0 (compatible; ColQwen3-demo/1.0)"},
    )
    for attempt_headers in header_attempts:
        response = requests.get(url, headers=attempt_headers, timeout=10)
        if response.status_code != 403:
            response.raise_for_status()
            return Image.open(BytesIO(response.content)).convert("RGB")
        # 403: fall through and retry with the next header set.
    raise RuntimeError(f"Could not fetch image from {url}")
|
||||
|
||||
|
||||
def encode_image_base64(image: Image.Image) -> str:
    """Serialize a PIL image as PNG and return it as a base64 ``data:`` URI."""
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return "data:image/png;base64," + encoded
|
||||
|
||||
|
||||
def compute_maxsim(q_emb: np.ndarray, d_emb: np.ndarray) -> float:
    """Return the ColBERT-style MaxSim score between a query and a document.

    The score is the sum, over query rows, of the largest dot product
    between that row and any document row.
    """
    similarity = np.matmul(q_emb, d_emb.T)
    best_per_query_row = np.max(similarity, axis=-1)
    return float(np.sum(best_per_query_row))
|
||||
|
||||
|
||||
# ── Encode functions ────────────────────────────────────────
|
||||
|
||||
|
||||
def encode_queries(texts: list[str], model: str, api_url: str) -> list[np.ndarray]:
    """Encode text queries into a list of multi-vector embeddings.

    All texts are sent in a single request as the ``input`` field. One array
    is built per item of the response's ``data`` list, from that item's own
    ``data`` field.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    resp = post_http_request({"model": model, "input": texts}, api_url)
    # Surface server-side failures as an HTTP error instead of letting them
    # show up later as an opaque KeyError on the missing "data" field.
    resp.raise_for_status()
    return [np.array(item["data"]) for item in resp.json()["data"]]
|
||||
|
||||
|
||||
def encode_images(image_urls: list[str], model: str, api_url: str) -> list[np.ndarray]:
    """Encode image documents into a list of multi-vector embeddings.

    Each image is downloaded, re-encoded as a base64 PNG data URI and sent
    through the chat-style `messages` field so that the vLLM multimodal
    processor handles it correctly.

    A request that fails (non-200 status, non-JSON body, or a body without a
    ``data`` field) is reported on stdout and skipped, so the returned list
    can be shorter than ``image_urls``.
    """
    embeddings = []
    for url in image_urls:
        print(f" Loading: {url.split('/')[-1]}...")
        image = load_image(url)
        image_uri = encode_image_base64(image)
        resp = post_http_request(
            {
                "model": model,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image_url", "image_url": {"url": image_uri}},
                            {"type": "text", "text": "Describe the image."},
                        ],
                    }
                ],
            },
            api_url,
        )
        # Error responses are not guaranteed to carry a JSON body; parse
        # defensively so such failures still reach the report-and-skip path
        # below instead of aborting the whole run with a decode error.
        try:
            result = resp.json()
        except ValueError:
            result = None
        if (
            resp.status_code != 200
            or not isinstance(result, dict)
            or "data" not in result
        ):
            detail = resp.text if result is None else str(result)
            print(f" Error ({resp.status_code}): {detail[:200]}")
            continue
        embeddings.append(np.array(result["data"][0]["data"]))
    return embeddings
|
||||
|
||||
|
||||
# ── Main ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line: ``--host``, ``--port`` and ``--model``."""
    cli = argparse.ArgumentParser()
    # (flag, type, default) for every supported option.
    options = (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "TomoroAI/tomoro-colqwen3-embed-4b"),
    )
    for flag, kind, default in options:
        cli.add_argument(flag, type=kind, default=default)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Run the ColQwen3 demo against a running vLLM server.

    Steps: (1) embed the text queries through ``/pooling``, (2) embed the
    image documents through ``/pooling``, (3) print a query x document MaxSim
    matrix computed client-side, (4) score a text query against text
    documents through ``/score``.

    NOTE: image embeddings are labelled by their position in the returned
    list; since failed images are skipped by ``encode_images``, the labels
    only line up with ``image_urls`` when every image was encoded.
    """
    base_url = f"http://{args.host}:{args.port}"
    pooling_url = f"{base_url}/pooling"
    score_url = f"{base_url}/score"
    model = args.model
    rule = "=" * 60

    def banner(title: str) -> None:
        # Section heading framed by two horizontal rules.
        print(rule)
        print(title)
        print(rule)

    # Same sample data as the official TomoroAI example
    queries = [
        "Retrieve the city of Singapore",
        "Retrieve the city of Beijing",
        "Retrieve the city of London",
    ]
    image_urls = [
        "https://upload.wikimedia.org/wikipedia/commons/2/27/Singapore_skyline_2022.jpg",
        "https://upload.wikimedia.org/wikipedia/commons/6/61/Beijing_skyline_at_night.JPG",
        "https://upload.wikimedia.org/wikipedia/commons/4/49/London_skyline.jpg",
    ]

    # 1) Text query embeddings
    banner("1. Encode text queries (multi-vector)")
    query_embeddings = encode_queries(queries, model, pooling_url)
    for q_idx, q_vectors in enumerate(query_embeddings):
        # Norm of the first token vector only, as a normalization spot check.
        first_norm = float(np.linalg.norm(q_vectors[0]))
        print(
            f' Query {q_idx}: {q_vectors.shape} (L2 norm: {first_norm:.4f}) "{queries[q_idx]}"'
        )

    # 2) Image document embeddings
    print()
    banner("2. Encode image documents (multi-vector)")
    doc_embeddings = encode_images(image_urls, model, pooling_url)
    for d_idx, d_vectors in enumerate(doc_embeddings):
        print(f" Doc {d_idx}: {d_vectors.shape} {image_urls[d_idx].split('/')[-1]}")

    # 3) Cross-modal MaxSim scoring (skipped when no image could be encoded)
    if doc_embeddings:
        print()
        banner("3. Cross-modal MaxSim scores (text queries × image docs)")
        # Header row: blank label column followed by one column per document.
        header_cells = "".join(f" Doc {col:>2d}" for col in range(len(doc_embeddings)))
        print(f"{'':>35s}" + header_cells)
        # One row per query, one score per document.
        for q_idx, q_vectors in enumerate(query_embeddings):
            row_cells = "".join(
                f" {compute_maxsim(q_vectors, d_vectors):6.2f}"
                for d_vectors in doc_embeddings
            )
            print(f" {queries[q_idx]:<33s}" + row_cells)

    # 4) Text-only /score endpoint
    print()
    banner("4. Text-only late interaction scoring (/score endpoint)")
    text_query = "What is the capital of France?"
    text_docs = [
        "The capital of France is Paris.",
        "Berlin is the capital of Germany.",
        "Python is a programming language.",
    ]
    score_payload = {"model": model, "text_1": text_query, "text_2": text_docs}
    resp = post_http_request(score_payload, score_url)
    print(f' Query: "{text_query}"\n')
    for entry in resp.json()["data"]:
        position = entry["index"]
        print(f" Doc {position} (score={entry['score']:.4f}): {text_docs[position]}")


if __name__ == "__main__":
    args = parse_args()
    main(args)
|
||||
156
tests/models/multimodal/pooling/test_colqwen3.py
Normal file
156
tests/models/multimodal/pooling/test_colqwen3.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for ColQwen3 late interaction model for multi-modal retrieval.
|
||||
|
||||
ColQwen3 is a multi-vector retrieval model based on Qwen3-VL backbone with
|
||||
ColBERT-style late interaction scoring (MaxSim). It produces per-token
|
||||
embeddings for both text and image inputs.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
|
||||
MODELS = [
|
||||
"TomoroAI/tomoro-colqwen3-embed-4b",
|
||||
"OpenSearch-AI/Ops-Colqwen3-4B",
|
||||
]
|
||||
|
||||
EMBED_DIMS = {
|
||||
"TomoroAI/tomoro-colqwen3-embed-4b": 320,
|
||||
"OpenSearch-AI/Ops-Colqwen3-4B": 2560,
|
||||
}
|
||||
|
||||
TEXT_QUERIES = [
|
||||
"What is the capital of France?",
|
||||
"Describe the contents of the document.",
|
||||
]
|
||||
|
||||
TEXT_DOCUMENTS = [
|
||||
"The capital of France is Paris.",
|
||||
"This document contains important financial data.",
|
||||
]
|
||||
|
||||
DTYPE = "half"
|
||||
|
||||
|
||||
def _run_token_embed_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check the shape and L2 normalization of per-token embeddings.

    Embeds the first text query and asserts that the result is a single 2D
    tensor with the model's expected embedding width, more than one token
    row, and row norms close to 1.
    """
    runner_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        outputs = vllm_model.token_embed([TEXT_QUERIES[0]])

    assert len(outputs) == 1
    token_matrix = torch.tensor(outputs[0])
    # Token embeddings should be 2D: [num_tokens, embed_dim]
    assert token_matrix.dim() == 2
    assert token_matrix.shape[1] == EMBED_DIMS[model]
    assert token_matrix.shape[0] > 1

    # Every token vector is expected to have (approximately) unit L2 norm.
    row_norms = torch.norm(token_matrix, p=2, dim=-1)
    expected_norms = torch.ones_like(row_norms)
    torch.testing.assert_close(row_norms, expected_norms, rtol=1e-2, atol=1e-2)
|
||||
|
||||
|
||||
def _run_late_interaction_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that the model's score agrees with a manual MaxSim computation.

    Token embeddings for one query and one document are scored with
    ``compute_maxsim_score`` and compared (1% relative tolerance) against
    the value returned by ``vllm_model.score`` for the same pair.
    """
    from vllm.entrypoints.pooling.score.utils import compute_maxsim_score

    query_text = TEXT_QUERIES[0]
    document_text = TEXT_DOCUMENTS[0]
    runner_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        query_outputs = vllm_model.token_embed([query_text])
        document_outputs = vllm_model.token_embed([document_text])

        query_matrix = torch.tensor(query_outputs[0])
        document_matrix = torch.tensor(document_outputs[0])

        # Reference value computed from the raw token embeddings.
        manual_score = compute_maxsim_score(query_matrix, document_matrix).item()

        vllm_scores = vllm_model.score(query_text, document_text)

    assert len(vllm_scores) == 1
    assert vllm_scores[0] == pytest.approx(manual_score, rel=0.01)
|
||||
|
||||
|
||||
def _run_relevance_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that on-topic documents outscore an off-topic one.

    Scores a machine-learning question against two related documents and one
    unrelated (weather) document, and asserts both related documents score
    strictly higher than the unrelated one.
    """
    query = "What is machine learning?"
    ml_doc = "Machine learning is a subset of artificial intelligence."
    weather_doc = "The weather forecast shows rain tomorrow."
    dl_doc = "Deep learning uses neural networks for complex tasks."
    documents = [ml_doc, weather_doc, dl_doc]

    runner_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        scores = vllm_model.score(query, documents)

    assert len(scores) == 3
    # Index 1 is the unrelated weather document.
    assert scores[0] > scores[1], "ML doc should score higher than weather doc"
    assert scores[2] > scores[1], "DL doc should score higher than weather doc"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_token_embed(vllm_runner, model: str, dtype: str) -> None:
    """Token-embedding shape/normalization check for every listed model."""
    _run_token_embed_test(vllm_runner, model, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_late_interaction_scoring(vllm_runner, model: str, dtype: str) -> None:
    """MaxSim score consistency check for every listed model."""
    _run_late_interaction_test(vllm_runner, model, dtype=dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_relevance_ordering(vllm_runner, model: str, dtype: str) -> None:
    """Relevance-ordering check for every listed model."""
    _run_relevance_test(vllm_runner, model, dtype=dtype)
|
||||
@@ -597,6 +597,12 @@ _EMBEDDING_EXAMPLE_MODELS = {
|
||||
"TIGER-Lab/VLM2Vec-Full", trust_remote_code=True
|
||||
),
|
||||
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"),
|
||||
"ColQwen3": _HfExamplesInfo(
|
||||
"TomoroAI/tomoro-colqwen3-embed-4b", trust_remote_code=True
|
||||
),
|
||||
"OpsColQwen3Model": _HfExamplesInfo(
|
||||
"OpenSearch-AI/Ops-Colqwen3-4B", trust_remote_code=True
|
||||
),
|
||||
"SiglipModel": _HfExamplesInfo("google/siglip-base-patch16-224"),
|
||||
"PrithviGeoSpatialMAE": _HfExamplesInfo(
|
||||
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
|
||||
|
||||
306
vllm/model_executor/models/colqwen3.py
Normal file
306
vllm/model_executor/models/colqwen3.py
Normal file
@@ -0,0 +1,306 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
ColQwen3 late interaction model for multi-modal retrieval and reranking.
|
||||
|
||||
ColQwen3 extends Qwen3-VL with a ColBERT-style late interaction head,
|
||||
producing per-token embeddings for both text and image inputs. It uses
|
||||
MaxSim scoring for retrieval/reranking tasks.
|
||||
|
||||
This model supports the "token_embed" pooling task and is designed for
|
||||
multi-vector retrieval of documents containing both text and images.
|
||||
|
||||
Reference: https://arxiv.org/abs/2407.01449 (ColPali)
|
||||
Based on: Qwen3-VL backbone with custom text projection
|
||||
|
||||
Target models:
|
||||
- TomoroAI/tomoro-colqwen3-embed-8b
|
||||
- OpenSearch-AI/Ops-Colqwen3-4B
|
||||
"""
|
||||
|
||||
from collections.abc import Iterable, Mapping
|
||||
from typing import ClassVar, Literal
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers.models.qwen3_vl import Qwen3VLProcessor
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from .interfaces_base import default_pooling_type
|
||||
from .qwen2_vl import Qwen2VLMultiModalDataParser
|
||||
from .qwen3_vl import (
|
||||
Qwen3VLDummyInputsBuilder,
|
||||
Qwen3VLForConditionalGeneration,
|
||||
Qwen3VLMultiModalProcessor,
|
||||
Qwen3VLProcessingInfo,
|
||||
)
|
||||
from .utils import AutoWeightsLoader, WeightsMapper
|
||||
|
||||
|
||||
class ColQwen3ProcessingInfo(Qwen3VLProcessingInfo):
    """Processing info shared by the ColQwen3 model family.

    ColQwen3 checkpoints (TomoroAI, OpenSearch-AI, etc.) ship custom
    HuggingFace configs (e.g. ColQwen3Config, OpsColQwen3Config) that are not
    instances of Qwen3VLConfig. get_hf_config() and get_hf_processor() are
    therefore overridden to skip the strict type check, similar to
    OpenCUAProcessingInfo.
    """

    def get_hf_config(self):
        """Return the HF config from the context without a type check."""
        return self.ctx.get_hf_config()

    def get_hf_processor(self, **kwargs: object) -> Qwen3VLProcessor:
        """Return a standard Qwen3VLProcessor (``use_fast`` defaults to True)."""
        # Force standard Qwen3VLProcessor even when trust_remote_code=True.
        # ColQwen3 custom processors (e.g. ColQwen3Processor) have
        # incompatible interfaces with vLLM's Qwen3VLMultiModalProcessor.
        # The standard Qwen3VLProcessor handles both text and image inputs
        # correctly for the Qwen3-VL backbone.
        use_fast = kwargs.pop("use_fast", True)
        return self.ctx.get_hf_processor(
            Qwen3VLProcessor,
            use_fast=use_fast,
            **kwargs,
        )

    @property
    def _supports_video(self) -> bool:
        """Whether the HF processor exposes a ``video_processor`` attribute."""
        processor = self.get_hf_processor()
        return hasattr(processor, "video_processor")

    def get_video_processor(self, **kwargs: object):
        """Return the HF video processor.

        Raises:
            AttributeError: if the processor has no ``video_processor``.
        """
        if self._supports_video:
            return self.get_hf_processor(**kwargs).video_processor  # type: ignore[attr-defined]
        raise AttributeError(
            f"The processor for {self.ctx.model_config.model} does not "
            "support video inputs (no video_processor attribute)."
        )

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        """Return per-modality limits; "video" is listed only if supported."""
        supported: dict[str, int | None] = {"image": None}
        if self._supports_video:
            supported["video"] = None
        return supported

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        """Return the max token count per item for each supported modality."""
        per_item: dict[str, int] = {"image": self.get_max_image_tokens()}
        if self._supports_video:
            per_item["video"] = self.get_max_video_tokens(seq_len, mm_counts)
        return per_item

    def get_data_parser(self):
        """Build the Qwen2-VL data parser from the vision config."""
        vision_config = self.get_hf_config().vision_config
        return Qwen2VLMultiModalDataParser(
            vision_config.spatial_merge_size,
            video_needs_metadata=self._supports_video,
            expected_hidden_size=self._get_expected_hidden_size(),
        )
|
||||
|
||||
|
||||
@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
|
||||
@MULTIMODAL_REGISTRY.register_processor(
|
||||
Qwen3VLMultiModalProcessor,
|
||||
info=ColQwen3ProcessingInfo,
|
||||
dummy_inputs=Qwen3VLDummyInputsBuilder,
|
||||
)
|
||||
class ColQwen3Model(
|
||||
Qwen3VLForConditionalGeneration,
|
||||
):
|
||||
"""ColQwen3 late interaction model for multi-modal retrieval/reranking.
|
||||
|
||||
This model extends Qwen3VLForConditionalGeneration with a ColBERT-style
|
||||
linear projection layer for per-token embeddings. It supports:
|
||||
- "token_embed" task: Per-token embeddings for late interaction scoring
|
||||
|
||||
The model produces L2-normalized per-token embeddings by:
|
||||
1. Running the Qwen3-VL backbone (vision + language) to get hidden states
|
||||
2. Projecting hidden states through a linear layer (hidden_size -> embed_dim)
|
||||
3. L2-normalizing the projected embeddings
|
||||
|
||||
ColBERT-style MaxSim scoring is computed externally, either client-side
|
||||
or via the late interaction scoring path in ServingScores.
|
||||
|
||||
Attributes:
|
||||
custom_text_proj: Linear projection from hidden_size to embed_dim
|
||||
supports_late_interaction: Flag indicating this model uses late
|
||||
interaction scoring
|
||||
"""
|
||||
|
||||
# Mark this as a pooling model so vLLM routes to pooler path
|
||||
is_pooling_model = True
|
||||
|
||||
# Mark this model as supporting late interaction scoring
|
||||
supports_late_interaction: ClassVar[Literal[True]] = True
|
||||
|
||||
# Override hf_to_vllm_mapper to handle ColQwen3 weight naming.
|
||||
# NOTE: WeightsMapper applies ALL matching prefix rules sequentially
|
||||
# (no early exit), so more-specific prefixes must come first.
|
||||
# TomoroAI: "vlm.model.visual.", "vlm.model.language_model."
|
||||
# ColPali: "model.visual.", "model.language_model."
|
||||
# OpenSearch: "visual.", "language_model." (no outer prefix,
|
||||
# re-prefixed to "model.*" in load_weights)
|
||||
hf_to_vllm_mapper = WeightsMapper(
|
||||
orig_to_new_prefix={
|
||||
# TomoroAI naming convention (most specific first)
|
||||
"vlm.model.visual.": "visual.",
|
||||
"vlm.lm_head.": "language_model.lm_head.",
|
||||
"vlm.model.language_model.": "language_model.model.",
|
||||
# ColPali / nvidia naming convention
|
||||
"model.visual.": "visual.",
|
||||
"lm_head.": "language_model.lm_head.",
|
||||
# OpenSearch-AI: after re-prefix, "language_model.model.*"
|
||||
# becomes "model.language_model.model.*" — handle this before
|
||||
# the shorter "model.language_model." rule to avoid double map
|
||||
"model.language_model.model.": "language_model.model.",
|
||||
"model.language_model.": "language_model.model.",
|
||||
}
|
||||
)
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
    """Build the backbone, the optional projection layer and the pooler.

    The text hidden size is read from ``hf_config.hidden_size`` or, failing
    that, from ``hf_config.text_config.hidden_size``. The embedding width is
    looked up under several config attribute names; if none is set, the
    projection layer is left as ``None`` to be created in ``load_weights``.

    Raises:
        ValueError: if no text hidden size can be found in the config.
    """
    super().__init__(vllm_config=vllm_config, prefix=prefix)

    model_config = vllm_config.model_config
    hf_config = model_config.hf_config
    head_dtype = model_config.head_dtype

    hidden_size = getattr(hf_config, "hidden_size", None)
    if hidden_size is None:
        if hasattr(hf_config, "text_config"):
            hidden_size = hf_config.text_config.hidden_size
        if hidden_size is None:
            raise ValueError(
                "Unable to determine text hidden size from config. "
                "Expected 'hidden_size' or 'text_config.hidden_size'."
            )
    self._proj_hidden_size = hidden_size

    # (TomoroAI: embed_dim, OpenSearch: dims, ColPali: dim)
    # First truthy attribute wins; otherwise the last lookup result is kept.
    embed_dim = None
    for attr in ("embed_dim", "dims", "dim", "projection_dim", "colbert_dim"):
        embed_dim = getattr(hf_config, attr, None)
        if embed_dim:
            break
    self.embed_dim: int | None = embed_dim

    # Build the projection layer now if the width is known; otherwise it
    # will be created during load_weights when the dim is inferred.
    self.custom_text_proj = (
        nn.Linear(hidden_size, self.embed_dim, bias=False, dtype=head_dtype)
        if self.embed_dim is not None
        else None
    )

    pooler_config = model_config.pooler_config
    assert pooler_config is not None
    self.pooler = pooler_for_token_embed(pooler_config, projector=None)
|
||||
|
||||
def forward(
    self,
    input_ids: torch.Tensor | None,
    positions: torch.Tensor,
    intermediate_tensors=None,
    inputs_embeds: torch.Tensor | None = None,
    **kwargs: object,
) -> torch.Tensor:
    """Run the backbone and return L2-normalized per-token embeddings.

    The parent forward pass produces hidden states. When that result is a
    tensor it is cast to the projection layer's dtype if needed, projected
    through ``custom_text_proj`` and normalized along the last dimension.
    A non-tensor result from the parent is returned unchanged.
    """
    backbone_out = super().forward(
        input_ids=input_ids,
        positions=positions,
        intermediate_tensors=intermediate_tensors,
        inputs_embeds=inputs_embeds,
        **kwargs,
    )

    if isinstance(backbone_out, torch.Tensor):
        projector = self.custom_text_proj
        target_dtype = projector.weight.dtype  # type: ignore
        if backbone_out.dtype != target_dtype:
            backbone_out = backbone_out.to(target_dtype)

        # Project to embedding dimension and L2 normalize
        projected = projector(backbone_out)  # type: ignore
        return torch.nn.functional.normalize(projected, p=2, dim=-1)

    return backbone_out  # type: ignore
|
||||
|
||||
# Names used for the projection layer across different ColQwen3 variants
|
||||
_PROJ_LAYER_NAMES = {
|
||||
"custom_text_proj", # ColPali naming
|
||||
"embedding_proj_layer", # TomoroAI naming
|
||||
}
|
||||
|
||||
def _is_proj_weight(self, name: str) -> bool:
|
||||
"""Check if a weight name belongs to the projection layer."""
|
||||
return any(proj_name in name for proj_name in self._PROJ_LAYER_NAMES)
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    """Load checkpoint weights, treating the projection layer specially.

    Backbone tensors are loaded through ``AutoWeightsLoader`` using
    ``self.hf_to_vllm_mapper``. Projection tensors (as identified by
    ``_is_proj_weight``) are copied directly into
    ``self.custom_text_proj``; when ``self.embed_dim`` is still ``None``
    the layer is created here, sized from the checkpoint's projection
    weight.

    Args:
        weights: ``(name, tensor)`` pairs from the checkpoint.

    Returns:
        The set of loaded parameter names. Projection parameters are
        reported as ``custom_text_proj.<param>`` regardless of the name
        used in the checkpoint.
    """
    # Materialize once: the names are scanned before the tensors are routed.
    weights_list = list(weights)
    names = [name for name, _ in weights_list]

    # Some checkpoints (OpenSearch-AI) store backbone weights unprefixed
    # ("language_model.*", "visual.*"); they need "model." added so that
    # hf_to_vllm_mapper can process them. Re-prefix only when such names
    # exist and no name already carries a "vlm." or "model." prefix.
    has_unprefixed = any(n.startswith(("language_model.", "visual.")) for n in names)
    has_prefixed = any(n.startswith(("vlm.", "model.")) for n in names)
    needs_reprefix = has_unprefixed and not has_prefixed

    proj_weights: list[tuple[str, torch.Tensor]] = []
    model_weights: list[tuple[str, torch.Tensor]] = []
    for name, weight in weights_list:
        if self._is_proj_weight(name):
            proj_weights.append((name, weight))
            continue
        if needs_reprefix:
            name = "model." + name
        model_weights.append((name, weight))

    loader = AutoWeightsLoader(self)
    loaded = loader.load_weights(model_weights, mapper=self.hf_to_vllm_mapper)

    if not proj_weights:
        return loaded

    # Build the lazily-created projection layer BEFORE copying anything,
    # so the result does not depend on whether the checkpoint yields the
    # bias or the weight tensor first.
    if self.embed_dim is None:
        proj_matrix = next((w for n, w in proj_weights if "weight" in n), None)
        if proj_matrix is not None:
            # Mirror the backbone's dtype/device for the new layer.
            reference = next(self.language_model.parameters())
            self.embed_dim = proj_matrix.shape[0]
            has_bias = any("bias" in n for n, _ in proj_weights)
            self.custom_text_proj = nn.Linear(
                self._proj_hidden_size,
                self.embed_dim,
                bias=has_bias,
                dtype=reference.dtype,
            )
            self.custom_text_proj.to(reference.device)

    if self.custom_text_proj is None:
        return loaded

    for name, weight in proj_weights:
        # Only the trailing component ("weight" / "bias") selects the
        # target parameter; the checkpoint's layer name is ignored.
        param_name = name.split(".")[-1]
        param = getattr(self.custom_text_proj, param_name, None)
        if param is None:
            # No matching parameter on the layer (e.g. it has no bias).
            continue
        weight = weight.to(device=param.device, dtype=param.dtype)
        default_weight_loader(param, weight)
        loaded.add(f"custom_text_proj.{param_name}")

    return loaded
|
||||
@@ -254,6 +254,8 @@ _EMBEDDING_MODELS = {
|
||||
),
|
||||
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
|
||||
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
|
||||
"ColQwen3": ("colqwen3", "ColQwen3Model"),
|
||||
"OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
|
||||
"SiglipModel": ("siglip", "SiglipEmbeddingModel"),
|
||||
# Technically Terratorch models work on images, both in
|
||||
# input and output. I am adding it here because it piggy-backs on embedding
|
||||
|
||||
@@ -74,6 +74,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
|
||||
afmoe="AfmoeConfig",
|
||||
bagel="BagelConfig",
|
||||
chatglm="ChatGLMConfig",
|
||||
colqwen3="ColQwen3Config",
|
||||
ops_colqwen3="OpsColQwen3Config",
|
||||
deepseek_vl_v2="DeepseekVLV2Config",
|
||||
deepseek_v32="DeepseekV3Config",
|
||||
flex_olmo="FlexOlmoConfig",
|
||||
|
||||
@@ -18,6 +18,9 @@ _CLASS_TO_MODULE: dict[str, str] = {
|
||||
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
|
||||
"BagelConfig": "vllm.transformers_utils.configs.bagel",
|
||||
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
|
||||
"ColQwen3Config": "vllm.transformers_utils.configs.colqwen3",
|
||||
"OpsColQwen3Config": "vllm.transformers_utils.configs.colqwen3",
|
||||
"Qwen3VLNemotronEmbedConfig": "vllm.transformers_utils.configs.colqwen3",
|
||||
"DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
|
||||
"DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
|
||||
"EAGLEConfig": "vllm.transformers_utils.configs.eagle",
|
||||
@@ -68,6 +71,9 @@ __all__ = [
|
||||
"AfmoeConfig",
|
||||
"BagelConfig",
|
||||
"ChatGLMConfig",
|
||||
"ColQwen3Config",
|
||||
"OpsColQwen3Config",
|
||||
"Qwen3VLNemotronEmbedConfig",
|
||||
"DeepseekVLV2Config",
|
||||
"DeepseekV3Config",
|
||||
"DotsOCRConfig",
|
||||
|
||||
58
vllm/transformers_utils/configs/colqwen3.py
Normal file
58
vllm/transformers_utils/configs/colqwen3.py
Normal file
@@ -0,0 +1,58 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
ColQwen3 configuration that extends Qwen3VLConfig with embedding projection
|
||||
fields. This allows ColQwen3 models to be loaded without trust_remote_code
|
||||
by mapping their custom model_type (colqwen3, ops_colqwen3, etc.) to a
|
||||
standard config class that vLLM understands.
|
||||
|
||||
Supported model_types:
|
||||
- colqwen3 (TomoroAI/tomoro-colqwen3-embed-8b)
|
||||
- ops_colqwen3 (OpenSearch-AI/Ops-Colqwen3-4B)
|
||||
- qwen3_vl_nemotron_embed (nvidia/nemotron-colembed-vl-8b-v2)
|
||||
"""
|
||||
|
||||
from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
|
||||
|
||||
|
||||
class ColQwen3Config(Qwen3VLConfig):
    """Qwen3-VL configuration carrying extra ColQwen3 projection fields.

    Everything accepted by ``Qwen3VLConfig`` is passed through unchanged;
    this class only records a few optional attributes describing the
    embedding projection layer.
    """

    # model_type of the base variant; subclasses set their own value.
    model_type = "colqwen3"

    def __init__(
        self,
        embed_dim: int | None = None,
        dims: int | None = None,
        dim: int | None = None,
        projection_dim: int | None = None,
        colbert_dim: int | None = None,
        pooling: str | None = None,
        **kwargs,
    ):
        """Store the projection fields, then initialise the base config.

        Args:
            embed_dim: Optional projection size, stored as-is.
            dims: Optional projection size, stored as-is.
            dim: Optional projection size, stored as-is.
            projection_dim: Optional projection size, stored as-is.
            colbert_dim: Optional projection size, stored as-is.
            pooling: Optional pooling setting, stored as-is.
            **kwargs: Forwarded to ``Qwen3VLConfig.__init__``.

        NOTE(review): the five ``*dim*`` fields are presumably alternative
        spellings of the same projection output size used by different
        checkpoints - confirm against the model code.
        """
        projection_fields = {
            "embed_dim": embed_dim,
            "dims": dims,
            "dim": dim,
            "projection_dim": projection_dim,
            "colbert_dim": colbert_dim,
            "pooling": pooling,
        }
        # Assigned before the base initialiser runs, in declaration order.
        for field_name, field_value in projection_fields.items():
            setattr(self, field_name, field_value)

        super().__init__(**kwargs)
|
||||
|
||||
|
||||
class OpsColQwen3Config(ColQwen3Config):
    """ColQwen3 configuration for the OpenSearch-AI variants.

    Identical to ``ColQwen3Config`` apart from its ``model_type``.
    """

    model_type = "ops_colqwen3"
|
||||
|
||||
|
||||
class Qwen3VLNemotronEmbedConfig(ColQwen3Config):
    """ColQwen3 configuration for the NVIDIA Nemotron ColEmbed variants.

    Identical to ``ColQwen3Config`` apart from its ``model_type``.
    """

    model_type = "qwen3_vl_nemotron_embed"
|
||||
Reference in New Issue
Block a user