examples/pooling/score/colqwen3_rerank_online.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using ColQwen3 late interaction model for reranking.

ColQwen3 is a multi-modal ColBERT-style model based on Qwen3-VL.
It produces per-token embeddings and uses MaxSim scoring for retrieval
and reranking. Supports both text and image inputs.

Start the server with:
    vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 50000

Then run this script:
    python colqwen3_rerank_online.py
"""

import requests

# Model name sent in the "model" field of every request body; it is the same
# model passed to ``vllm serve`` in the module docstring.
MODEL = "TomoroAI/tomoro-colqwen3-embed-4b"
# Base URL that the endpoint paths (/rerank, /score) are appended to.
BASE_URL = "http://127.0.0.1:8000"

# HTTP headers passed with every POST made by this script.
headers = {"accept": "application/json", "Content-Type": "application/json"}


def rerank_text():
    """Rerank a small set of text documents against a query via /rerank.

    Posts the query and candidate documents to ``{BASE_URL}/rerank`` and
    prints one ``[score] document`` line per returned result, in the order
    the server returns them. Transport failures (connection refused,
    timeout, ...) and non-200 responses are reported on stdout rather than
    raised, so the remaining examples can still run.
    """
    print("=" * 60)
    print("1. Text reranking (/rerank)")
    print("=" * 60)

    data = {
        "model": MODEL,
        "query": "What is machine learning?",
        "documents": [
            "Machine learning is a subset of artificial intelligence.",
            "Python is a programming language.",
            "Deep learning uses neural networks for complex tasks.",
            "The weather today is sunny.",
        ],
    }

    try:
        # requests never times out by default; bound the wait so a stalled
        # server cannot hang the example forever.
        response = requests.post(
            f"{BASE_URL}/rerank", headers=headers, json=data, timeout=60
        )
    except requests.RequestException as exc:
        # Covers connection errors and timeouts, e.g. the server is not up.
        print(f"  Request failed: {exc}")
        return

    if response.status_code == 200:
        result = response.json()
        print("\n  Ranked documents (most relevant first):")
        for item in result["results"]:
            # "index" refers back to the position in the submitted documents.
            doc_idx = item["index"]
            score = item["relevance_score"]
            print(f"    [{score:.4f}] {data['documents'][doc_idx]}")
    else:
        print(f"  Request failed: {response.status_code}")
        # Truncate the body so a large error payload does not flood stdout.
        print(f"  {response.text[:300]}")


def score_text():
    """Score text documents against a query via /score.

    Posts the query as ``text_1`` and the documents as ``text_2`` to
    ``{BASE_URL}/score`` and prints one line per returned item with its
    index, score, and the matching document. Transport failures and non-200
    responses are reported on stdout rather than raised.
    """
    print()
    print("=" * 60)
    print("2. Text scoring (/score)")
    print("=" * 60)

    query = "What is the capital of France?"
    documents = [
        "The capital of France is Paris.",
        "Berlin is the capital of Germany.",
        "Python is a programming language.",
    ]

    data = {
        "model": MODEL,
        "text_1": query,
        "text_2": documents,
    }

    try:
        # requests never times out by default; bound the wait so a stalled
        # server cannot hang the example forever.
        response = requests.post(
            f"{BASE_URL}/score", headers=headers, json=data, timeout=60
        )
    except requests.RequestException as exc:
        # Covers connection errors and timeouts, e.g. the server is not up.
        print(f"  Request failed: {exc}")
        return

    if response.status_code == 200:
        result = response.json()
        print(f"\n  Query: {query}\n")
        for item in result["data"]:
            # "index" refers back to the position in the submitted documents.
            idx = item["index"]
            score = item["score"]
            print(f"    Doc {idx} (score={score:.4f}): {documents[idx]}")
    else:
        print(f"  Request failed: {response.status_code}")
        # Truncate the body so a large error payload does not flood stdout.
        print(f"  {response.text[:300]}")


def score_text_top_n():
    """Rerank text documents via /rerank, requesting only the top 2 results.

    Despite the ``score_`` prefix in its name, this example posts to
    ``{BASE_URL}/rerank`` with ``top_n`` set to 2 and prints one
    ``[score] document`` line per returned result. Transport failures and
    non-200 responses are reported on stdout rather than raised.
    """
    print()
    print("=" * 60)
    print("3. Text reranking with top_n=2 (/rerank)")
    print("=" * 60)

    data = {
        "model": MODEL,
        "query": "What is the capital of France?",
        "documents": [
            "The capital of France is Paris.",
            "Berlin is the capital of Germany.",
            "Python is a programming language.",
            "The Eiffel Tower is in Paris.",
        ],
        "top_n": 2,
    }

    try:
        # requests never times out by default; bound the wait so a stalled
        # server cannot hang the example forever.
        response = requests.post(
            f"{BASE_URL}/rerank", headers=headers, json=data, timeout=60
        )
    except requests.RequestException as exc:
        # Covers connection errors and timeouts, e.g. the server is not up.
        print(f"  Request failed: {exc}")
        return

    if response.status_code == 200:
        result = response.json()
        print(f"\n  Top {data['top_n']} results:")
        for item in result["results"]:
            # "index" refers back to the position in the submitted documents.
            doc_idx = item["index"]
            score = item["relevance_score"]
            print(f"    [{score:.4f}] {data['documents'][doc_idx]}")
    else:
        print(f"  Request failed: {response.status_code}")
        # Truncate the body so a large error payload does not flood stdout.
        print(f"  {response.text[:300]}")


def main():
    """Run every example in order: rerank, score, then top-n rerank."""
    for example in (rerank_text, score_text, score_text_top_n):
        example()


# Run the examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
[new model] add COLQwen3 code & Inference (#34398) Signed-off-by: craftsangjae <craftsangjae@gmail.com> Signed-off-by: katacoder <craftsangjae@gmail.com> 2026-02-14 13:15:19 +09:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`"""`
			`Example of using ColQwen3 late interaction model for reranking.`

			`ColQwen3 is a multi-modal ColBERT-style model based on Qwen3-VL.`
			`It produces per-token embeddings and uses MaxSim scoring for retrieval`
			`and reranking. Supports both text and image inputs.`

			`Start the server with:`
			`vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 50000`

			`Then run this script:`
			`python colqwen3_rerank_online.py`
			`"""`

			`import requests`

			`MODEL = "TomoroAI/tomoro-colqwen3-embed-4b"`
			`BASE_URL = "http://127.0.0.1:8000"`

			`headers = {"accept": "application/json", "Content-Type": "application/json"}`


			`def rerank_text():`
			`"""Text-only reranking via /rerank endpoint."""`
			`print("=" * 60)`
			`print("1. Text reranking (/rerank)")`
			`print("=" * 60)`

			`data = {`
			`"model": MODEL,`
			`"query": "What is machine learning?",`
			`"documents": [`
			`"Machine learning is a subset of artificial intelligence.",`
			`"Python is a programming language.",`
			`"Deep learning uses neural networks for complex tasks.",`
			`"The weather today is sunny.",`
			`],`
			`}`

			`response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data)`

			`if response.status_code == 200:`
			`result = response.json()`
			`print("\n Ranked documents (most relevant first):")`
			`for item in result["results"]:`
			`doc_idx = item["index"]`
			`score = item["relevance_score"]`
			`print(f" [{score:.4f}] {data['documents'][doc_idx]}")`
			`else:`
			`print(f" Request failed: {response.status_code}")`
			`print(f" {response.text[:300]}")`


			`def score_text():`
			`"""Text-only scoring via /score endpoint."""`
			`print()`
			`print("=" * 60)`
			`print("2. Text scoring (/score)")`
			`print("=" * 60)`

			`query = "What is the capital of France?"`
			`documents = [`
			`"The capital of France is Paris.",`
			`"Berlin is the capital of Germany.",`
			`"Python is a programming language.",`
			`]`

			`data = {`
			`"model": MODEL,`
			`"text_1": query,`
			`"text_2": documents,`
			`}`

			`response = requests.post(f"{BASE_URL}/score", headers=headers, json=data)`

			`if response.status_code == 200:`
			`result = response.json()`
			`print(f"\n Query: {query}\n")`
			`for item in result["data"]:`
			`idx = item["index"]`
			`score = item["score"]`
			`print(f" Doc {idx} (score={score:.4f}): {documents[idx]}")`
			`else:`
			`print(f" Request failed: {response.status_code}")`
			`print(f" {response.text[:300]}")`


			`def score_text_top_n():`
			`"""Text reranking with top_n filtering via /rerank endpoint."""`
			`print()`
			`print("=" * 60)`
			`print("3. Text reranking with top_n=2 (/rerank)")`
			`print("=" * 60)`

			`data = {`
			`"model": MODEL,`
			`"query": "What is the capital of France?",`
			`"documents": [`
			`"The capital of France is Paris.",`
			`"Berlin is the capital of Germany.",`
			`"Python is a programming language.",`
			`"The Eiffel Tower is in Paris.",`
			`],`
			`"top_n": 2,`
			`}`

			`response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data)`

			`if response.status_code == 200:`
			`result = response.json()`
			`print(f"\n Top {data['top_n']} results:")`
			`for item in result["results"]:`
			`doc_idx = item["index"]`
			`score = item["relevance_score"]`
			`print(f" [{score:.4f}] {data['documents'][doc_idx]}")`
			`else:`
			`print(f" Request failed: {response.status_code}")`
			`print(f" {response.text[:300]}")`


			`def main():`
			`rerank_text()`
			`score_text()`
			`score_text_top_n()`


			`if __name__ == "__main__":`
			`main()`