# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example of using ColBERT late interaction models for reranking and scoring. ColBERT (Contextualized Late Interaction over BERT) uses per-token embeddings and MaxSim scoring for document reranking, providing better accuracy than single-vector models while being more efficient than cross-encoders. vLLM supports ColBERT with multiple encoder backbones. Start the server with one of the following: # BERT backbone (works out of the box) vllm serve answerdotai/answerai-colbert-small-v1 # ModernBERT backbone vllm serve lightonai/GTE-ModernColBERT-v1 \ --hf-overrides '{"architectures": ["ColBERTModernBertModel"]}' # Jina XLM-RoBERTa backbone vllm serve jinaai/jina-colbert-v2 \ --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \ --trust-remote-code Then run this script: python colbert_rerank_online.py """ import json import requests # Change this to match the model you started the server with MODEL = "answerdotai/answerai-colbert-small-v1" BASE_URL = "http://127.0.0.1:8000" headers = {"accept": "application/json", "Content-Type": "application/json"} documents = [ "Machine learning is a subset of artificial intelligence.", "Python is a programming language.", "Deep learning uses neural networks for complex tasks.", "The weather today is sunny.", ] def rerank_example(): """Use the /rerank endpoint to rank documents by query relevance.""" print("=== Rerank Example ===") data = { "model": MODEL, "query": "What is machine learning?", "documents": documents, } response = requests.post(f"{BASE_URL}/rerank", headers=headers, json=data) result = response.json() print(json.dumps(result, indent=2)) print("\nRanked documents (most relevant first):") for item in result["results"]: doc_idx = item["index"] score = item["relevance_score"] print(f" Score {score:.4f}: {documents[doc_idx]}") def score_example(): """Use the /score endpoint for pairwise query-document scoring.""" print("\n=== Score Example ===") data = { "model": MODEL, "text_1": "What is machine learning?", "text_2": [ "Machine learning is a subset of AI.", "The weather is sunny.", ], } response = requests.post(f"{BASE_URL}/score", headers=headers, json=data) result = response.json() print(json.dumps(result, indent=2)) def main(): rerank_example() score_example() if __name__ == "__main__": main()