Files
vllm/examples/pooling/score/colqwen3_rerank_online.py
Kata Coder d1ea65d0a1 [new model] add COLQwen3 code & Inference (#34398)
Signed-off-by: craftsangjae <craftsangjae@gmail.com>
Signed-off-by: katacoder <craftsangjae@gmail.com>
2026-02-14 12:15:19 +08:00

131 lines
3.7 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using ColQwen3 late interaction model for reranking.
ColQwen3 is a multi-modal ColBERT-style model based on Qwen3-VL.
It produces per-token embeddings and uses MaxSim scoring for retrieval
and reranking. Supports both text and image inputs.
Start the server with:
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 50000
Then run this script:
python colqwen3_rerank_online.py
"""
import requests
MODEL = "TomoroAI/tomoro-colqwen3-embed-4b"
BASE_URL = "http://127.0.0.1:8000"
headers = {"accept": "application/json", "Content-Type": "application/json"}
def rerank_text():
"""Text-only reranking via /rerank endpoint."""
print("=" * 60)
print("1. Text reranking (/rerank)")
print("=" * 60)
data = {
"model": MODEL,
"query": "What is machine learning?",
"documents": [
"Machine learning is a subset of artificial intelligence.",
"Python is a programming language.",
"Deep learning uses neural networks for complex tasks.",
"The weather today is sunny.",
],
}
response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data)
if response.status_code == 200:
result = response.json()
print("\n Ranked documents (most relevant first):")
for item in result["results"]:
doc_idx = item["index"]
score = item["relevance_score"]
print(f" [{score:.4f}] {data['documents'][doc_idx]}")
else:
print(f" Request failed: {response.status_code}")
print(f" {response.text[:300]}")
def score_text():
"""Text-only scoring via /score endpoint."""
print()
print("=" * 60)
print("2. Text scoring (/score)")
print("=" * 60)
query = "What is the capital of France?"
documents = [
"The capital of France is Paris.",
"Berlin is the capital of Germany.",
"Python is a programming language.",
]
data = {
"model": MODEL,
"text_1": query,
"text_2": documents,
}
response = requests.post(f"{BASE_URL}/score", headers=headers, json=data)
if response.status_code == 200:
result = response.json()
print(f"\n Query: {query}\n")
for item in result["data"]:
idx = item["index"]
score = item["score"]
print(f" Doc {idx} (score={score:.4f}): {documents[idx]}")
else:
print(f" Request failed: {response.status_code}")
print(f" {response.text[:300]}")
def score_text_top_n():
"""Text reranking with top_n filtering via /rerank endpoint."""
print()
print("=" * 60)
print("3. Text reranking with top_n=2 (/rerank)")
print("=" * 60)
data = {
"model": MODEL,
"query": "What is the capital of France?",
"documents": [
"The capital of France is Paris.",
"Berlin is the capital of Germany.",
"Python is a programming language.",
"The Eiffel Tower is in Paris.",
],
"top_n": 2,
}
response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data)
if response.status_code == 200:
result = response.json()
print(f"\n Top {data['top_n']} results:")
for item in result["results"]:
doc_idx = item["index"]
score = item["relevance_score"]
print(f" [{score:.4f}] {data['documents'][doc_idx]}")
else:
print(f" Request failed: {response.status_code}")
print(f" {response.text[:300]}")
def main():
rerank_text()
score_text()
score_text_top_n()
if __name__ == "__main__":
main()