feat: Add ColBERT late interaction model support (#33686)
Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com> Signed-off-by: Ilya Boytsov <boytsovpanamera@mail.ru> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
247
tests/models/language/pooling/test_colbert.py
Normal file
247
tests/models/language/pooling/test_colbert.py
Normal file
@@ -0,0 +1,247 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for ColBERT late interaction scoring."""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score
|
||||
|
||||
# ColBERT model - using answerai-colbert-small-v1 as it's a smaller model
# suitable for testing (based on BERT-base)
COLBERT_MODEL = "answerdotai/answerai-colbert-small-v1"
COLBERT_DIM = 96  # This model uses 96-dimensional output

# Query texts; each is paired by index with the matching passage in TEXTS_2.
TEXTS_1 = [
    "What is the capital of France?",
    "What is the capital of Germany?",
]

# Document texts answering the queries above (index-aligned with TEXTS_1).
TEXTS_2 = [
    "The capital of France is Paris.",
    "The capital of Germany is Berlin.",
]

# Default dtype for most tests; the HF-comparison test overrides to float32.
DTYPE = "half"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def colbert_model_name():
    """Module-scoped fixture exposing the ColBERT model identifier."""
    return COLBERT_MODEL
|
||||
|
||||
|
||||
def test_colbert_token_embed(vllm_runner, colbert_model_name):
    """Verify the ColBERT model emits a 2D per-token embedding matrix."""
    with vllm_runner(
        colbert_model_name,
        runner="pooling",
        dtype=DTYPE,
        max_model_len=512,
        enforce_eager=True,
    ) as model:
        # Embed a single input text.
        results = model.token_embed([TEXTS_1[0]])

        assert len(results) == 1
        token_matrix = torch.tensor(results[0])
        # Token embeddings are 2D: [num_tokens, colbert_dim].
        assert token_matrix.dim() == 2
        assert token_matrix.shape[1] == COLBERT_DIM
        # A full sentence should tokenize to more than one token.
        assert token_matrix.shape[0] > 1
|
||||
|
||||
|
||||
def test_colbert_late_interaction_1_to_1(vllm_runner, colbert_model_name):
    """ColBERT late interaction scoring for a single query/document pair."""
    with vllm_runner(
        colbert_model_name,
        runner="pooling",
        dtype=DTYPE,
        max_model_len=512,
        enforce_eager=True,
    ) as model:
        # Token-level embeddings for both sides of the pair.
        query_emb = torch.tensor(model.token_embed([TEXTS_1[0]])[0])
        doc_emb = torch.tensor(model.token_embed([TEXTS_2[0]])[0])

        # Reference MaxSim score computed directly from the embeddings.
        expected = compute_maxsim_score(query_emb, doc_emb).item()

        # The score API (late interaction path) should agree with it.
        scores = model.score(TEXTS_1[0], TEXTS_2[0])

        assert len(scores) == 1
        assert scores[0] == pytest.approx(expected, rel=0.01)
|
||||
|
||||
|
||||
def test_colbert_late_interaction_1_to_N(vllm_runner, colbert_model_name):
    """ColBERT late interaction scoring of one query against N documents."""
    with vllm_runner(
        colbert_model_name,
        runner="pooling",
        dtype=DTYPE,
        max_model_len=512,
        enforce_eager=True,
    ) as model:
        # One query embedding, N document embeddings.
        query_emb = torch.tensor(model.token_embed([TEXTS_1[0]])[0])
        doc_outputs = model.token_embed(TEXTS_2)

        # Reference MaxSim score for every document.
        expected = [
            compute_maxsim_score(query_emb, torch.tensor(doc_out)).item()
            for doc_out in doc_outputs
        ]

        # The score API should reproduce the manual scores.
        scores = model.score(TEXTS_1[0], TEXTS_2)

        assert len(scores) == 2
        for got, want in zip(scores, expected):
            assert got == pytest.approx(want, rel=0.01)
|
||||
|
||||
|
||||
def test_colbert_late_interaction_N_to_N(vllm_runner, colbert_model_name):
    """Test ColBERT late interaction scoring with N:N query-documents.

    Each query in TEXTS_1 is scored against the document at the same index
    in TEXTS_2, and the API result is compared against a manually computed
    MaxSim score for that pair.
    """
    with vllm_runner(
        colbert_model_name,
        runner="pooling",
        dtype=DTYPE,
        max_model_len=512,
        enforce_eager=True,
    ) as vllm_model:
        # Get token embeddings for all queries and all documents.
        q_outputs = vllm_model.token_embed(TEXTS_1)
        d_outputs = vllm_model.token_embed(TEXTS_2)

        # Compute MaxSim manually for each pair. strict=True fails loudly
        # if token_embed ever returns a different number of outputs than
        # inputs, instead of silently truncating the comparison.
        manual_scores = []
        for q_out, d_out in zip(q_outputs, d_outputs, strict=True):
            q_emb = torch.tensor(q_out)
            d_emb = torch.tensor(d_out)
            manual_scores.append(compute_maxsim_score(q_emb, d_emb).item())

        # Use the score API.
        vllm_scores = vllm_model.score(TEXTS_1, TEXTS_2)

        # One score per index-aligned query/document pair.
        assert len(vllm_scores) == len(TEXTS_1)
        for vllm_score, manual_score in zip(vllm_scores, manual_scores, strict=True):
            assert vllm_score == pytest.approx(manual_score, rel=0.01)
|
||||
|
||||
|
||||
def test_colbert_relevance_ordering(vllm_runner, colbert_model_name):
    """ColBERT should rank on-topic documents above an off-topic one."""
    query = "What is machine learning?"
    documents = [
        "Machine learning is a subset of artificial intelligence.",
        "Python is a programming language.",
        "Deep learning uses neural networks.",
    ]

    with vllm_runner(
        colbert_model_name,
        runner="pooling",
        dtype=DTYPE,
        max_model_len=512,
        enforce_eager=True,
    ) as model:
        scores = model.score(query, documents)

        assert len(scores) == 3
        # Both ML-themed documents (indices 0 and 2) must outrank the
        # unrelated Python document (index 1).
        assert scores[0] > scores[1], "ML doc should score higher than Python doc"
        assert scores[2] > scores[1], "DL doc should score higher than Python doc"
|
||||
|
||||
|
||||
def test_colbert_embed_not_supported(vllm_runner, colbert_model_name):
    """The sequence-level 'embed' task must be rejected for ColBERT."""
    with (
        vllm_runner(
            colbert_model_name,
            runner="pooling",
            dtype=DTYPE,
            max_model_len=512,
            enforce_eager=True,
        ) as model,
        pytest.raises(ValueError, match="Embedding API is not supported"),
    ):
        model.embed([TEXTS_1[0]])
|
||||
|
||||
|
||||
def test_colbert_hf_comparison(vllm_runner, colbert_model_name):
    """Test that vLLM ColBERT produces same embeddings as HuggingFace.

    Builds a reference pipeline by loading the base BERT encoder through
    transformers, manually applying the ColBERT linear projection from the
    checkpoint's safetensors, and L2-normalizing — then compares those
    per-token embeddings against vLLM's token_embed output.
    """
    import torch.nn.functional as F
    from huggingface_hub import hf_hub_download
    from safetensors.torch import load_file
    from transformers import AutoTokenizer, BertModel

    test_texts = [TEXTS_1[0], TEXTS_2[0]]

    # Get vLLM embeddings first (to avoid GPU memory contention)
    # Use fp32 to match HuggingFace default precision for fair comparison
    with vllm_runner(
        colbert_model_name,
        runner="pooling",
        dtype="float32",
        max_model_len=512,
        enforce_eager=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.token_embed(test_texts)

    # Get HuggingFace reference embeddings on CPU
    # Load the base BERT model and manually apply the ColBERT linear projection
    hf_tokenizer = AutoTokenizer.from_pretrained(colbert_model_name)
    hf_bert = BertModel.from_pretrained(colbert_model_name)
    hf_bert.eval()

    # Load the ColBERT linear weights from safetensors
    weights_path = hf_hub_download(colbert_model_name, filename="model.safetensors")
    weights = load_file(weights_path)
    linear_weight = weights["linear.weight"]  # [96, 384]

    hf_embeddings = []
    for text in test_texts:
        inputs = hf_tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = hf_bert(**inputs)
            # Get last hidden state: [1, seq_len, 384]
            hidden_states = outputs.last_hidden_state
            # Apply ColBERT linear projection: [1, seq_len, 96]
            token_emb = F.linear(hidden_states, linear_weight)
            # L2 normalize
            token_emb = F.normalize(token_emb, p=2, dim=-1)
            hf_embeddings.append(token_emb.squeeze(0).float())

    # Compare embeddings. strict=True fails loudly if vLLM returned a
    # different number of outputs than the reference pipeline produced.
    for i, (hf_emb, vllm_out) in enumerate(
        zip(hf_embeddings, vllm_outputs, strict=True)
    ):
        vllm_emb = torch.tensor(vllm_out).float()

        # Print first few components for debugging
        print(f"\n=== Text {i}: '{test_texts[i][:30]}...' ===")
        print(f"HF shape: {hf_emb.shape}, vLLM shape: {vllm_emb.shape}")
        print(f"HF first token, first 10 dims: {hf_emb[0, :10].tolist()}")
        print(f"vLLM first token, first 10 dims: {vllm_emb[0, :10].tolist()}")
        print(f"HF last token, first 10 dims: {hf_emb[-1, :10].tolist()}")
        print(f"vLLM last token, first 10 dims: {vllm_emb[-1, :10].tolist()}")

        # Should have same shape (checked explicitly for a clearer message
        # than assert_close would give)
        assert hf_emb.shape == vllm_emb.shape, (
            f"Shape mismatch for text {i}: HF {hf_emb.shape} vs vLLM {vllm_emb.shape}"
        )

        # Values should match within fp32 numerical tolerance (both runs
        # use float32 here; the loose 1e-2 bounds absorb kernel differences)
        torch.testing.assert_close(
            vllm_emb,
            hf_emb,
            rtol=1e-2,
            atol=1e-2,
            msg=f"Embedding mismatch for text {i}",
        )
|
||||
Reference in New Issue
Block a user