# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example of using ColBERT late interaction models for reranking and scoring.

ColBERT (Contextualized Late Interaction over BERT) uses per-token embeddings
and MaxSim scoring for document reranking, providing better accuracy than
single-vector models while being more efficient than cross-encoders.

vLLM supports ColBERT with multiple encoder backbones. Start the server
with one of the following:

    # BERT backbone (works out of the box)
    vllm serve answerdotai/answerai-colbert-small-v1

    # ModernBERT backbone
    vllm serve lightonai/GTE-ModernColBERT-v1 \
        --hf-overrides '{"architectures": ["ColBERTModernBertModel"]}'

    # Jina XLM-RoBERTa backbone
    vllm serve jinaai/jina-colbert-v2 \
        --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
        --trust-remote-code

Then run this script:
    python colbert_rerank_online.py
"""
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
2026-02-13 10:53:09 +01:00
|
|
|
# Change this to match the model you started the server with
MODEL = "answerdotai/answerai-colbert-small-v1"
BASE_URL = "http://127.0.0.1:8000"

# Sent with every request below: JSON request body, JSON response expected.
headers = {
    "accept": "application/json",
    "Content-Type": "application/json",
}

# Candidate texts submitted to the /rerank endpoint by rerank_example().
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Python is a programming language.",
    "Deep learning uses neural networks for complex tasks.",
    "The weather today is sunny.",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rerank_example():
    """Use the /rerank endpoint to rank documents by query relevance.

    Posts one query together with the module-level ``documents`` list to
    ``{BASE_URL}/rerank``, prints the raw JSON response, then prints each
    returned entry's relevance score next to the document its ``index``
    refers to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: If the request cannot be completed
            (e.g. connection failure or timeout).
    """
    print("=== Rerank Example ===")

    data = {
        "model": MODEL,
        "query": "What is machine learning?",
        "documents": documents,
    }

    # A timeout keeps the example from hanging forever when the server is
    # unreachable or stuck.
    response = requests.post(
        f"{BASE_URL}/rerank", headers=headers, json=data, timeout=60
    )
    if not response.ok:
        # Show the server's error payload, then fail loudly. Without this
        # check the error body would be treated as a result and the loop
        # below would die with an unhelpful KeyError on "results".
        print(response.text)
        response.raise_for_status()

    result = response.json()
    print(json.dumps(result, indent=2))

    print("\nRanked documents (most relevant first):")
    for item in result["results"]:
        doc_idx = item["index"]
        score = item["relevance_score"]
        print(f" Score {score:.4f}: {documents[doc_idx]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def score_example():
    """Use the /score endpoint for pairwise query-document scoring.

    Posts one query (``text_1``) and a list of candidate texts (``text_2``)
    to ``{BASE_URL}/score`` and pretty-prints the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: If the request cannot be completed
            (e.g. connection failure or timeout).
    """
    print("\n=== Score Example ===")

    data = {
        "model": MODEL,
        "text_1": "What is machine learning?",
        "text_2": [
            "Machine learning is a subset of AI.",
            "The weather is sunny.",
        ],
    }

    # A timeout keeps the example from hanging forever when the server is
    # unreachable or stuck.
    response = requests.post(
        f"{BASE_URL}/score", headers=headers, json=data, timeout=60
    )
    if not response.ok:
        # Show the server's error payload, then fail loudly instead of
        # printing an error body as if it were a successful score result.
        print(response.text)
        response.raise_for_status()

    result = response.json()
    print(json.dumps(result, indent=2))
|
2026-02-05 01:05:13 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the rerank example followed by the score example."""
    for run_example in (rerank_example, score_example):
        run_example()
|
2026-02-05 01:05:13 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the examples only when this file is executed as a script.
if __name__ == "__main__":
    main()
|