[Frontend] Support multimodal inputs for late-interaction scoring (ColQwen3) + NewModel: nvidia/nemotron-colembed (#34574)

Signed-off-by: craftsangjae <craftsangjae@gmail.com>
This commit is contained in:
Kata Coder
2026-02-21 13:01:40 +09:00
committed by GitHub
parent 11be2c74dc
commit 5719a4e4e6
10 changed files with 532 additions and 66 deletions

View File

@@ -1,7 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example of using ColQwen3 late interaction model for reranking.
Example of using ColQwen3 late interaction model for reranking and scoring.
ColQwen3 is a multi-modal ColBERT-style model based on Qwen3-VL.
It produces per-token embeddings and uses MaxSim scoring for retrieval
@@ -14,13 +15,65 @@ Then run this script:
python colqwen3_rerank_online.py
"""
import base64
from io import BytesIO
import requests
from PIL import Image
# Model name sent in the "model" field of every request payload below.
MODEL = "TomoroAI/tomoro-colqwen3-embed-4b"
# Root URL of the server; the "/score" and "/rerank" paths are appended to it.
BASE_URL = "http://127.0.0.1:8000"
# HTTP headers passed with each POST to the scoring / reranking endpoints.
headers = {"accept": "application/json", "Content-Type": "application/json"}
# ── Image helpers ──────────────────────────────────────────
def load_image(url: str) -> Image.Image:
    """Fetch the image at *url* and return it converted to RGB.

    The download is attempted first with no extra headers and then, if the
    server answered 403, once more with a browser-like ``User-Agent``
    (handles Wikimedia 403). Any other error status is raised immediately
    via ``raise_for_status``.

    Raises:
        RuntimeError: if every attempt was answered with HTTP 403.
    """
    header_variants = [
        {},
        {"User-Agent": "Mozilla/5.0 (compatible; ColQwen3-demo/1.0)"},
    ]
    for attempt_headers in header_variants:
        reply = requests.get(url, headers=attempt_headers, timeout=15)
        if reply.status_code != 403:
            # Non-403 answer: either a usable body or a hard failure.
            reply.raise_for_status()
            raw = BytesIO(reply.content)
            return Image.open(raw).convert("RGB")
        # 403: fall through and retry with the next header set.
    raise RuntimeError(f"Could not fetch image from {url}")
def encode_image_base64(image: Image.Image) -> str:
    """Serialize *image* as PNG and return it as a base64 ``data:`` URI."""
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"
def make_image_content(image_url: str, text: str = "Describe the image.") -> dict:
    """Download an image and wrap it as a ScoreMultiModalParam dict.

    The returned dict has a single ``"content"`` list holding an
    ``image_url`` part (the image inlined as a base64 PNG data URI)
    followed by a ``text`` part carrying *text*.
    """
    data_uri = encode_image_base64(load_image(image_url))
    image_part = {
        "type": "image_url",
        "image_url": {"url": data_uri},
    }
    text_part = {"type": "text", "text": text}
    return {"content": [image_part, text_part]}
# ── Sample image URLs ─────────────────────────────────────
# Sample skyline photos keyed by a short city label. The multi-modal examples
# download each URL and also use the keys as display labels for the results.
IMAGE_URLS = {
"beijing": "https://upload.wikimedia.org/wikipedia/commons/6/61/Beijing_skyline_at_night.JPG",
"london": "https://upload.wikimedia.org/wikipedia/commons/4/49/London_skyline.jpg",
"singapore": "https://upload.wikimedia.org/wikipedia/commons/2/27/Singapore_skyline_2022.jpg",
}
# ── Text-only examples ────────────────────────────────────
def rerank_text():
"""Text-only reranking via /rerank endpoint."""
@@ -120,11 +173,86 @@ def score_text_top_n():
print(f" {response.text[:300]}")
# ── Multi-modal examples (text query × image documents) ──
def score_text_vs_images():
    """Score one text query against every sample image via ``/score``.

    Downloads each entry of ``IMAGE_URLS``, sends the query as ``data_1``
    and the image documents as ``data_2``, then prints one score line per
    returned item (or the status code and the first 300 characters of the
    body when the request fails).
    """
    banner = "=" * 60
    print()
    print(banner)
    print("4. Multi-modal scoring: text query vs image docs (/score)")
    print(banner)

    query = "Retrieve the city of Beijing"
    labels = list(IMAGE_URLS)

    print(f"\n Loading {len(labels)} images...")
    documents = []
    for name in labels:
        documents.append(make_image_content(IMAGE_URLS[name]))

    payload = {
        "model": MODEL,
        "data_1": query,
        "data_2": documents,
    }
    reply = requests.post(f"{BASE_URL}/score", headers=headers, json=payload)

    if reply.status_code != 200:
        print(f" Request failed: {reply.status_code}")
        print(f" {reply.text[:300]}")
        return

    body = reply.json()
    print(f'\n Query: "{query}"\n')
    for entry in body["data"]:
        # The returned index refers back to the position in `labels`.
        position = entry["index"]
        print(f" Doc {position} [{labels[position]}] score={entry['score']:.4f}")
def rerank_text_vs_images():
    """Rerank the sample images against a text query via ``/rerank``.

    Downloads each entry of ``IMAGE_URLS``, posts them as ``documents``
    together with the query and ``top_n`` of 2, then prints the relevance
    score and label of every returned result (or the status code and the
    first 300 characters of the body when the request fails).
    """
    banner = "=" * 60
    print()
    print(banner)
    print("5. Multi-modal reranking: text query vs image docs (/rerank)")
    print(banner)

    query = "Retrieve the city of London"
    labels = list(IMAGE_URLS)

    print(f"\n Loading {len(labels)} images...")
    documents = []
    for name in labels:
        documents.append(make_image_content(IMAGE_URLS[name]))

    payload = {
        "model": MODEL,
        "query": query,
        "documents": documents,
        "top_n": 2,
    }
    reply = requests.post(f"{BASE_URL}/rerank", headers=headers, json=payload)

    if reply.status_code != 200:
        print(f" Request failed: {reply.status_code}")
        print(f" {reply.text[:300]}")
        return

    body = reply.json()
    print(f'\n Query: "{query}"')
    print(f" Top {payload['top_n']} results:\n")
    for entry in body["results"]:
        # The returned index refers back to the position in `labels`.
        position = entry["index"]
        print(f" [{entry['relevance_score']:.4f}] {labels[position]}")
# ── Main ──────────────────────────────────────────────────
def main():
    """Run every example in order: text-only first, then multi-modal."""
    # Text-only
    text_only_demos = (rerank_text, score_text, score_text_top_n)
    # Multi-modal (text query × image documents)
    multimodal_demos = (score_text_vs_images, rerank_text_vs_images)

    for demo in text_only_demos + multimodal_demos:
        demo()


# Run the examples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()