# vllm/examples/pooling/score/colbert_rerank_online.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using ColBERT late interaction model for reranking.

ColBERT (Contextualized Late Interaction over BERT) uses per-token embeddings
and MaxSim scoring for document reranking, providing better accuracy than
single-vector models while being more efficient than cross-encoders.

Start the server with:
    vllm serve answerdotai/answerai-colbert-small-v1

Then run this script:
    python colbert_rerank_online.py
"""

import json

import requests

# Rerank endpoint that the request is sent to.
url = "http://127.0.0.1:8000/rerank"

# Ask for a JSON response and declare a JSON request body.
headers = {
    "accept": "application/json",
    "Content-Type": "application/json",
}

# Request payload: one query plus the candidate documents to score against it.
# The model name matches the one given in the `vllm serve` command above.
data = {
    "model": "answerdotai/answerai-colbert-small-v1",
    "query": "What is machine learning?",
    "documents": [
        "Machine learning is a subset of artificial intelligence.",
        "Python is a programming language.",
        "Deep learning uses neural networks for complex tasks.",
        "The weather today is sunny.",
    ],
}


def main():
    """Send the rerank request and print each document with its score.

    Posts the module-level ``data`` payload to ``url``. On HTTP 200 the JSON
    response is pretty-printed, followed by one line per entry of its
    ``results`` list (in the order the server returned them) showing the
    entry's ``relevance_score`` and the input document its ``index`` refers
    to. Any other status code, or a failure to reach the server at all, is
    reported on stdout instead.
    """
    # Without a timeout, requests blocks forever if the server never answers.
    timeout_s = 60

    try:
        response = requests.post(url, headers=headers, json=data, timeout=timeout_s)
    except requests.exceptions.RequestException as exc:
        # Covers connection refused (server not started) and timeouts.
        print(f"Request to {url} failed: {exc}")
        return

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)
        return

    print("ColBERT Rerank Request successful!")
    result = response.json()
    print(json.dumps(result, indent=2))

    # Show ranked results
    print("\nRanked documents (most relevant first):")
    for item in result["results"]:
        # "index" points back into the documents list that was sent.
        doc_idx = item["index"]
        score = item["relevance_score"]
        print(f"  Score {score:.4f}: {data['documents'][doc_idx]}")


# Only send the request when executed as a script, not when imported.
if __name__ == "__main__":
    main()