[CI] Reorganize scoring tests (#38207)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Authored by wang.yuqi on 2026-03-26 20:07:01 +08:00; committed by GitHub.
parent f2d16207c7
commit dcdc145893
20 changed files with 1595 additions and 975 deletions

View File

@@ -85,7 +85,7 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
expected_message = (
"truncate_prompt_tokens value is "
"greater than max_model_len."
" Please, select a smaller truncation size."
" Please request a smaller truncation size."
)
assert error_details["message"] == expected_message

View File

@@ -288,7 +288,7 @@ async def test_truncate_prompt_tokens(client: openai.AsyncOpenAI, model_name: st
assert "error" in response.object
assert (
"truncate_prompt_tokens value is greater than max_model_len. "
"Please, select a smaller truncation size." in response.message
"Please request a smaller truncation size." in response.message
)

View File

@@ -1,69 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
outputs = llm.score(
queries,
documents,
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
return torch.tensor([x.outputs.score for x in outputs])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
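The invariant these assertions encode: use_activation=False returns the raw classifier logits, and applying the model's own activation to those logits should reproduce the default (activated) scores. A minimal, self-contained sketch of that check with made-up tensors; the helper and the default activation below are illustrative (this particular test uses the softmax utility as the activation):
import torch
def activation_relation_holds(raw: torch.Tensor,
                              activated: torch.Tensor,
                              activation=torch.sigmoid,
                              atol: float = 1e-2) -> bool:
    # Raw logits and activated scores should differ...
    changed = not torch.allclose(raw, activated, atol=atol)
    # ...but activating the raw logits should recover the activated scores.
    recovered = torch.allclose(activation(raw), activated, atol=atol)
    return changed and recovered
# Example with made-up logits:
logits = torch.tensor([1.2, -0.4])
assert activation_relation_holds(logits, torch.sigmoid(logits))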

View File

@@ -1,142 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Online API tests for ColBERT late interaction scoring."""
import pytest
import requests
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
MODEL_NAME = "answerdotai/answerai-colbert-small-v1"
COLBERT_DIM = 96
MAX_MODEL_LEN = 512
@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len",
str(MAX_MODEL_LEN),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
class TestColBERTOnline:
def test_rerank(self, server: RemoteOpenAIServer):
"""Test ColBERT rerank endpoint."""
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
paris_result = next(r for r in rerank.results if r.index == 1)
brazil_result = next(r for r in rerank.results if r.index == 0)
assert paris_result.relevance_score > brazil_result.relevance_score
def test_rerank_top_n(self, server: RemoteOpenAIServer):
"""Test ColBERT rerank with top_n parameter."""
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Machine learning is a field of AI.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
"top_n": 2,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert len(rerank.results) == 2
assert rerank.results[0].index == 1
def test_score(self, server: RemoteOpenAIServer):
"""Test ColBERT score endpoint."""
text_1 = "What is the capital of France?"
text_2 = ["The capital of France is Paris.", "Python is a language."]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"text_1": text_1,
"text_2": text_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
assert score.data[0].score > score.data[1].score
def test_token_embed(self, server: RemoteOpenAIServer):
"""Test ColBERT token_embed task via pooling endpoint."""
text = "What is the capital of France?"
pooling_response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": text,
"task": "token_embed",
},
)
pooling_response.raise_for_status()
pooling = pooling_response.json()
assert "data" in pooling
assert len(pooling["data"]) == 1
embeddings = pooling["data"][0]["data"]
assert isinstance(embeddings, list)
assert len(embeddings) > 0
assert len(embeddings[0]) == COLBERT_DIM
def test_embed_not_supported(self, server: RemoteOpenAIServer):
"""Test that ColBERT model does not support 'embed' task."""
task = "embed"
text = "What is the capital of France?"
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": text,
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(
f"Unsupported task: {task!r}"
)
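For context on the scoring scheme exercised in this file: ColBERT late interaction scores a query/document pair by matching each query token embedding against its best-matching document token embedding and summing those maxima (MaxSim). A minimal sketch over per-token embeddings such as those returned by the token_embed task above, assuming both sides are already normalized (real implementations may differ in normalization and masking):
import torch
def maxsim_score(query_tokens: torch.Tensor, doc_tokens: torch.Tensor) -> float:
    # query_tokens: (num_query_tokens, dim); doc_tokens: (num_doc_tokens, dim)
    sim = query_tokens @ doc_tokens.T              # all pairwise token similarities
    return sim.max(dim=1).values.sum().item()      # best doc token per query token, summed
# Example with random embeddings of the ColBERT dimension used above:
score = maxsim_score(torch.randn(6, 96), torch.randn(9, 96))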

View File

@@ -1,240 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse
from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
input_text = "This product was excellent and exceeded my expectations"
input_tokens = [0, 3293, 12996, 509, 40881, 136, 204839, 297, 759, 202702, 2]
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_basic(server: RemoteOpenAIServer, model_name: str):
# test /v1/models
response = requests.get(server.url_for("/v1/models"))
served_model = response.json()["data"][0]["id"]
assert served_model == MODEL_NAME
# test /tokenize
response = requests.post(
server.url_for("/tokenize"),
json={"model": model_name, "prompt": input_text},
)
assert response.json()["tokens"] == input_tokens
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_top_n(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Cross-encoder models are neat",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": model_name, "query": query, "documents": documents, "top_n": 2},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?" * 100
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": model_name, "query": query, "documents": documents},
)
assert rerank_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input." in rerank_response.text
def test_invocations(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
request_args = {
"model": MODEL_NAME,
"query": query,
"documents": documents,
}
rerank_response = requests.post(server.url_for("rerank"), json=request_args)
rerank_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
rerank_output = rerank_response.json()
invocation_output = invocation_response.json()
assert rerank_output.keys() == invocation_output.keys()
for rerank_result, invocations_result in zip(
rerank_output["results"], invocation_output["results"]
):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.05
)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
async def get_outputs(use_activation):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": query,
"documents": documents,
"use_activation": use_activation,
},
)
outputs = response.json()
return torch.tensor([x["relevance_score"] for x in outputs["results"]])
default = await get_outputs(use_activation=None)
w_activation = await get_outputs(use_activation=True)
wo_activation = await get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": "classify",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"task": "token_classify",
"input": input_text,
"encoding_format": "float",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == len(input_tokens)
assert len(poolings.data[0].data[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
server: RemoteOpenAIServer, model_name: str, task: str
):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}")
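As test_use_activation above implies, a client that asks for raw scores (use_activation=False) can recover the activated relevance scores itself by applying the model's activation, a sigmoid for this cross-encoder. A small client-side sketch; the endpoint and field names follow the requests in the tests above, and the explicit sigmoid is specific to this kind of model:
import requests
import torch
def rerank_raw_then_activate(base_url: str, model: str, query: str, documents: list[str]):
    response = requests.post(
        f"{base_url}/rerank",
        json={
            "model": model,
            "query": query,
            "documents": documents,
            "use_activation": False,  # ask the server for raw logits
        },
    )
    response.raise_for_status()
    raw = torch.tensor([r["relevance_score"] for r in response.json()["results"]])
    return torch.sigmoid(raw)         # activated scores, computed client-side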

View File

@@ -1,342 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import requests
import torch
import torch.nn.functional as F
from torch import tensor
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.platforms import current_platform
MODELS = [
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
]
DTYPE = "half"
def run_transformers(hf_model, model, text_pairs):
if model["is_cross_encoder"]:
return hf_model.predict(text_pairs).tolist()
else:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
return [
F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
for pair in hf_embeddings
]
@pytest.fixture(scope="class", params=MODELS)
def model(request):
yield request.param
@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server
@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
kwargs = {
"dtype": DTYPE,
"is_cross_encoder"
if model["is_cross_encoder"]
else "is_sentence_transformer": True,
}
with hf_runner(model["name"], **kwargs) as hf_model:
yield hf_model
class TestModel:
def test_queries_str_documents_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries, documents]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_queries_str_items_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
items = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"items": items,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries, items]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_str_text_2_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1, text_2]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_data_1_str_data_2_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
data_1 = "What is the capital of France?"
data_2 = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"data_1": data_1,
"data_2": data_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[data_1, data_2]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_queries_str_documents_list(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries, documents[0]], [queries, documents[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_queries_list_documents_list(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = [
"What is the capital of the United States?",
"What is the capital of France?",
]
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries[0], documents[0]], [queries[1], documents[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_score_max_model_len(
self, server: RemoteOpenAIServer, model: dict[str, Any]
):
queries = "What is the capital of France?" * 20
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
assert score_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input." in score_response.text
# Test truncation
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
"truncate_prompt_tokens": 101,
},
)
assert score_response.status_code == 400
assert "Please request a smaller truncation size." in score_response.text
def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
request_args = {
"model": model["name"],
"queries": queries,
"documents": documents,
}
score_response = requests.post(server.url_for("score"), json=request_args)
score_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
score_output = score_response.json()
invocation_output = invocation_response.json()
assert score_output.keys() == invocation_output.keys()
for score_data, invocation_data in zip(
score_output["data"], invocation_output["data"]
):
assert score_data.keys() == invocation_data.keys()
assert score_data["score"] == pytest.approx(
invocation_data["score"], rel=0.05
)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16
def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
def get_outputs(use_activation):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
"use_activation": use_activation,
},
)
outputs = response.json()
return torch.tensor([x["score"] for x in outputs["data"]])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
if model["is_cross_encoder"]:
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPT = "The chef prepared a delicious meal."
EMBEDDING_SIZE = 384
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
DTYPE = "half"
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="module")
def hf_model():
return EncoderScoringHfRunner(MODEL_NAME)
@pytest.mark.skip_global_cleanup
def test_1_to_1(llm, hf_model):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
hf_outputs = hf_model.predict([text_pair]).tolist()
vllm_outputs = [
output.outputs.score for output in llm.score(text_pair[0], text_pair[1])
]
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_1_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
hf_outputs = hf_model.predict(text_pairs).tolist()
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)]
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_n_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
hf_outputs = hf_model.predict(text_pairs).tolist()
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)]
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_embed(llm):
outputs = llm.encode(PROMPT, pooling_task="embed", use_tqdm=False)
assert len(outputs) == 1
assert len(outputs[0].outputs.data) == EMBEDDING_SIZE
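Since multilingual-e5-small is an embedding model rather than a cross-encoder, the scores checked against EncoderScoringHfRunner here are, in effect, similarities between the two sentence embeddings, typically cosine similarity. A minimal reference computation over a pair of embedding vectors (sketch only; the actual reference lives in the hf_model fixture):
import torch
import torch.nn.functional as F
def cosine_score(emb_1: torch.Tensor, emb_2: torch.Tensor) -> float:
    # Normalize both embeddings, then take their dot product.
    emb_1 = F.normalize(emb_1, dim=-1)
    emb_2 = F.normalize(emb_2, dim=-1)
    return float(emb_1 @ emb_2)
# Example with random vectors of the embedding size asserted above:
score = cosine_score(torch.randn(384), torch.randn(384))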

View File

@@ -0,0 +1,414 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-base-en-v1.5"
input_text = "This product was excellent and exceeded my expectations"
DTYPE = "half"
EMBEDDING_SIZE = 768
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def hf_model():
return EncoderScoringHfRunner(MODEL_NAME)
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_1(
hf_model, server: RemoteOpenAIServer
):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2[0],
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_str_n_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_vs_documents(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_vs_items(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"items": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_text_1_vs_text_2(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"text_1": TEXTS_1,
"text_2": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_data_1_vs_data_2(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"data_1": TEXTS_1,
"data_2": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_rerank_api_texts(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
paris_result = next(r for r in rerank.results if r.index == 1)
brazil_result = next(r for r in rerank.results if r.index == 0)
assert paris_result.relevance_score > brazil_result.relevance_score
@pytest.mark.asyncio
async def test_rerank_api_top_n(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Cross-encoder models are neat",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": MODEL_NAME, "query": query, "documents": documents, "top_n": 2},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].index == 1
@pytest.mark.asyncio
async def test_rerank_api_max_model_len(server: RemoteOpenAIServer):
query = "What is the capital of France?" * 100
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": MODEL_NAME, "query": query, "documents": documents},
)
assert rerank_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input prompt" in rerank_response.text
@pytest.mark.asyncio
async def test_score_api_max_model_len(server: RemoteOpenAIServer):
queries = "What is the capital of France?" * 20
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": queries,
"documents": documents,
},
)
assert score_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input prompt" in score_response.text
# Test truncation
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": queries,
"documents": documents,
"truncate_prompt_tokens": 101,
},
)
assert score_response.status_code == 400
assert "Please request a smaller truncation size." in score_response.text
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
request_args = {
"model": MODEL_NAME,
"query": query,
"documents": documents,
}
rerank_response = requests.post(server.url_for("rerank"), json=request_args)
rerank_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
rerank_output = rerank_response.json()
invocation_output = invocation_response.json()
assert rerank_output.keys() == invocation_output.keys()
for rerank_result, invocations_result in zip(
rerank_output["results"], invocation_output["results"]
):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01
)
@pytest.mark.asyncio
async def test_pooling_embed(server: RemoteOpenAIServer):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"task": "embed",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == EMBEDDING_SIZE
@pytest.mark.asyncio
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(server: RemoteOpenAIServer, task: str):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}")
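The max-length tests above exercise two distinct 400 responses: a prompt that exceeds the context window, and a truncate_prompt_tokens value larger than max_model_len. A client-side sketch of avoiding the second error by clamping the requested truncation to the served context length; the endpoint and fields mirror the requests above, while the clamping policy itself is illustrative:
import requests
def score_with_clamped_truncation(base_url: str, model: str, queries, documents,
                                  requested_truncation: int, max_model_len: int):
    # Never request truncation to more tokens than the model can hold.
    truncation = min(requested_truncation, max_model_len)
    return requests.post(
        f"{base_url}/score",
        json={
            "model": model,
            "queries": queries,
            "documents": documents,
            "truncate_prompt_tokens": truncation,
        },
    )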

View File

@@ -0,0 +1,137 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
PROMPT = "The chef prepared a delicious meal."
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="module")
def hf_model(hf_runner):
return hf_runner(MODEL_NAME, is_cross_encoder=True)
@pytest.mark.skip_global_cleanup
def test_1_to_1(llm, hf_model):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
hf_outputs = hf_model.predict([text_pair]).tolist()
vllm_outputs = [
output.outputs.score for output in llm.score(text_pair[0], text_pair[1])
]
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_1_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)]
hf_outputs = hf_model.predict(text_pairs).tolist()
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_n_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)]
hf_outputs = hf_model.predict(text_pairs).tolist()
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_classify(llm):
outputs = llm.encode(PROMPT, pooling_task="classify", use_tqdm=False)
assert len(outputs) == 1
assert len(outputs[0].outputs.data) == 1
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
outputs = llm.score(
TEXTS_1[0],
TEXTS_2[0],
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
return torch.tensor([x.outputs.score for x in outputs])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)

View File

@@ -0,0 +1,487 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "half"
input_text = "This product was excellent and exceeded my expectations"
input_tokens = [0, 3293, 12996, 509, 40881, 136, 204839, 297, 759, 202702, 2]
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def hf_model(hf_runner):
return hf_runner(MODEL_NAME, is_cross_encoder=True)
@pytest.mark.asyncio
async def test_basic(server: RemoteOpenAIServer):
# test /v1/models
response = requests.get(server.url_for("/v1/models"))
served_model = response.json()["data"][0]["id"]
assert served_model == MODEL_NAME
# test /tokenize
response = requests.post(
server.url_for("/tokenize"),
json={"model": MODEL_NAME, "prompt": input_text},
)
assert response.json()["tokens"] == input_tokens
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_1(
hf_model, server: RemoteOpenAIServer
):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2[0],
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_str_n_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_vs_documents(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_vs_items(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"items": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_text_1_vs_text_2(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"text_1": TEXTS_1,
"text_2": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_data_1_vs_data_2(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"data_1": TEXTS_1,
"data_2": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_rerank_api_texts(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.asyncio
async def test_rerank_api_top_n(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Cross-encoder models are neat",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": MODEL_NAME, "query": query, "documents": documents, "top_n": 2},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.asyncio
async def test_rerank_api_max_model_len(server: RemoteOpenAIServer):
query = "What is the capital of France?" * 100
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": MODEL_NAME, "query": query, "documents": documents},
)
assert rerank_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input prompt" in rerank_response.text
@pytest.mark.asyncio
async def test_score_api_max_model_len(server: RemoteOpenAIServer):
queries = "What is the capital of France?" * 20
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": queries,
"documents": documents,
},
)
assert score_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input prompt" in score_response.text
# Test truncation
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": queries,
"documents": documents,
"truncate_prompt_tokens": 101,
},
)
assert score_response.status_code == 400
assert "Please request a smaller truncation size." in score_response.text
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
request_args = {
"model": MODEL_NAME,
"query": query,
"documents": documents,
}
rerank_response = requests.post(server.url_for("rerank"), json=request_args)
rerank_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
rerank_output = rerank_response.json()
invocation_output = invocation_response.json()
assert rerank_output.keys() == invocation_output.keys()
for rerank_result, invocations_result in zip(
rerank_output["results"], invocation_output["results"]
):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01
)
@pytest.mark.asyncio
async def test_use_activation(server: RemoteOpenAIServer):
async def get_outputs(use_activation):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
"use_activation": use_activation,
},
)
outputs = response.json()
return torch.tensor([x["relevance_score"] for x in outputs["results"]])
default = await get_outputs(use_activation=None)
w_activation = await get_outputs(use_activation=True)
wo_activation = await get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
@pytest.mark.asyncio
async def test_pooling_classify(server: RemoteOpenAIServer):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"task": "classify",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 1
@pytest.mark.asyncio
async def test_pooling_token_classify(server: RemoteOpenAIServer):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"task": "token_classify",
"input": input_text,
"encoding_format": "float",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == len(input_tokens)
assert len(poolings.data[0].data[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(server: RemoteOpenAIServer, task: str):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}")

View File

@@ -123,7 +123,10 @@ def server(request):
yield remote_server, backend
def test_score_api_queries_str_documents_str(server: tuple[RemoteOpenAIServer, str]):
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_str(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
score_response = requests.post(
remote_server.url_for("score"),
@@ -143,7 +146,8 @@ def test_score_api_queries_str_documents_str(server: tuple[RemoteOpenAIServer, s
assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "text_vs_text")
def test_score_api_queries_str_documents_text_content(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_text_content(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -165,7 +169,8 @@ def test_score_api_queries_str_documents_text_content(
assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "text_vs_text")
def test_score_api_queries_str_documents_image_url_content(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_image_url_content(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -187,7 +192,8 @@ def test_score_api_queries_str_documents_image_url_content(
assert_score(score.data[0].score, TEXT_VS_IMAGE, backend, "text_vs_image")
def test_score_api_queries_str_documents_image_base64_content(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_image_base64_content(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -209,7 +215,8 @@ def test_score_api_queries_str_documents_image_base64_content(
assert_score(score.data[0].score, TEXT_VS_IMAGE, backend, "text_vs_image_base64")
def test_score_api_queries_str_documents_image_url_plus_text_content(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_image_url_plus_text_content(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -233,7 +240,8 @@ def test_score_api_queries_str_documents_image_url_plus_text_content(
)
def test_score_api_queries_str_documents_list(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_list(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -268,7 +276,8 @@ def test_score_api_queries_str_documents_list(
)
def test_rerank_api_queries_str_documents_list(
@pytest.mark.asyncio
async def test_rerank_api_queries_str_documents_list(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -320,7 +329,8 @@ def test_rerank_api_queries_str_documents_list(
)
def test_score_api_queries_list_documents_list(
@pytest.mark.asyncio
async def test_score_api_queries_list_documents_list(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
from .util import ColBERTScoringHfRunner
MODEL_NAME = "answerdotai/answerai-colbert-small-v1"
COLBERT_DIM = 96
LINEAR_WEIGHTS_KEY = "linear.weight"
PROMPT = "The chef prepared a delicious meal."
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
DTYPE = "half"
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="module")
def hf_model():
return ColBERTScoringHfRunner(
model_name=MODEL_NAME, linear_weights_key=LINEAR_WEIGHTS_KEY
)
@pytest.mark.skip_global_cleanup
def test_1_to_1(llm, hf_model):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
hf_outputs = hf_model.predict([text_pair]).tolist()
vllm_outputs = [
output.outputs.score for output in llm.score(text_pair[0], text_pair[1])
]
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_1_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
hf_outputs = hf_model.predict(text_pairs).tolist()
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)]
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_n_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
hf_outputs = hf_model.predict(text_pairs).tolist()
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)]
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_token_embed(llm):
outputs = llm.encode(PROMPT, pooling_task="token_embed", use_tqdm=False)
assert len(outputs) == 1
assert outputs[0].outputs.data.shape == (9, COLBERT_DIM)

View File

@@ -0,0 +1,232 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Online API tests for ColBERT late interaction scoring."""

import pytest
import requests

from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse

from .util import ColBERTScoringHfRunner

MODEL_NAME = "answerdotai/answerai-colbert-small-v1"
COLBERT_DIM = 96
MAX_MODEL_LEN = 512
LINEAR_WEIGHTS_KEY = "linear.weight"

TEXTS_1 = [
    "What is the capital of France?",
    "What is the capital of Germany?",
]
TEXTS_2 = [
    "The capital of France is Paris.",
    "The capital of Germany is Berlin.",
]


@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len",
str(MAX_MODEL_LEN),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def hf_model():
return ColBERTScoringHfRunner(
model_name=MODEL_NAME, linear_weights_key=LINEAR_WEIGHTS_KEY
)
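

# The /score requests below mirror the offline shapes: a single query/document
# pair, one query string broadcast over a list of documents, and element-wise
# pairs from two equal-length lists. Scores are compared against the HF MaxSim
# reference runner.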
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_1(
hf_model, server: RemoteOpenAIServer
):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2[0],
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)


@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)


@pytest.mark.asyncio
async def test_score_api_queries_str_n_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)


@pytest.mark.asyncio
async def test_rerank_api_texts(server: RemoteOpenAIServer):
"""Test ColBERT rerank endpoint."""
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
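    # Results keep the index of the original document, so index 1 is the Paris
    # answer; it should outscore the Brasilia distractor for this query.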
paris_result = next(r for r in rerank.results if r.index == 1)
brazil_result = next(r for r in rerank.results if r.index == 0)
assert paris_result.relevance_score > brazil_result.relevance_score


@pytest.mark.asyncio
async def test_rerank_api_top_n(server: RemoteOpenAIServer):
"""Test ColBERT rerank with top_n parameter."""
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Machine learning is a field of AI.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
"top_n": 2,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
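    # With top_n=2 only the two best-scoring documents are returned, ordered
    # by relevance, so the Paris document (original index 1) should rank first.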
assert len(rerank.results) == 2
assert rerank.results[0].index == 1


@pytest.mark.asyncio
async def test_token_embed(server: RemoteOpenAIServer):
"""Test ColBERT token_embed task via pooling endpoint."""
text = "What is the capital of France?"
pooling_response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": text,
"task": "token_embed",
},
)
pooling_response.raise_for_status()
pooling = pooling_response.json()
assert "data" in pooling
assert len(pooling["data"]) == 1
embeddings = pooling["data"][0]["data"]
assert isinstance(embeddings, list)
assert len(embeddings) > 0
assert len(embeddings[0]) == COLBERT_DIM


@pytest.mark.asyncio
async def test_embed_not_supported(server: RemoteOpenAIServer):
"""Test that ColBERT model does not support 'embed' task."""
task = "embed"
text = "What is the capital of France?"
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": text,
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}")

View File

@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from transformers import AutoModel, AutoTokenizer

from tests.conftest import HfRunner
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score


class ColBERTScoringHfRunner(torch.nn.Module):
def __init__(self, model_name, linear_weights_key):
super().__init__()
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
extra = {}
if self.device.type == "cpu":
extra["attn_implementation"] = "eager"
self.model = AutoModel.from_pretrained(
model_name,
**extra,
).to(self.device)
self.model.eval()
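        # The ColBERT projection head is stored in the checkpoint's
        # safetensors (it is not part of the AutoModel graph), so load it
        # directly and keep it in float32 for the reference computation.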
path = hf_hub_download(model_name, filename="model.safetensors")
weights = load_file(path)
self.linear_weight = weights[linear_weights_key].to(self.device).float()

    @torch.inference_mode()
def forward(self, texts):
embeddings = []
for text in texts:
inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
hidden = self.model(**inputs).last_hidden_state.float()
projected = F.linear(hidden, self.linear_weight.float())
normalised = F.normalize(projected, p=2, dim=-1)
embeddings.append(normalised.squeeze(0).cpu())
return embeddings

    @torch.inference_mode()
def predict(self, prompts: list[list[str]], *args, **kwargs):
hf_embeddings = [self(prompt) for prompt in prompts]
hf_outputs = [
compute_maxsim_score(*map(torch.tensor, pair)).item()
for pair in hf_embeddings
]
return torch.as_tensor(hf_outputs)
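

# Illustrative sketch only: the MaxSim late-interaction score that
# compute_maxsim_score is expected to compute. For each query token embedding,
# take the maximum similarity over all document token embeddings, then sum
# across query tokens; with L2-normalised embeddings (as produced in forward()
# above) the similarity reduces to a dot product. The tests use the vLLM
# helper, not this sketch.
def _maxsim_reference(
    query_embeddings: torch.Tensor,  # (num_query_tokens, dim)
    doc_embeddings: torch.Tensor,  # (num_doc_tokens, dim)
) -> torch.Tensor:
    similarity = query_embeddings @ doc_embeddings.T
    return similarity.max(dim=-1).values.sum()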


class EncoderScoringHfRunner(HfRunner):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs, is_sentence_transformer=True)
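
    # Reference for embedding-based scorers: encode each text with the
    # sentence-transformers pipeline and score a pair by the cosine
    # similarity of its two sentence embeddings.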
@torch.inference_mode()
def predict(self, prompts: list[list[str]], *args, **kwargs):
hf_embeddings = [self.encode(prompt) for prompt in prompts]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
return torch.as_tensor(hf_outputs)