[CI] Reorganize scoring tests (#38207)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Authored by wang.yuqi on 2026-03-26 20:07:01 +08:00; committed by GitHub.
parent f2d16207c7
commit dcdc145893
20 changed files with 1595 additions and 975 deletions

View File

@@ -85,7 +85,7 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
expected_message = (
"truncate_prompt_tokens value is "
"greater than max_model_len."
" Please, select a smaller truncation size."
" Please request a smaller truncation size."
)
assert error_details["message"] == expected_message

View File

@@ -288,7 +288,7 @@ async def test_truncate_prompt_tokens(client: openai.AsyncOpenAI, model_name: st
assert "error" in response.object
assert (
"truncate_prompt_tokens value is greater than max_model_len. "
"Please, select a smaller truncation size." in response.message
"Please request a smaller truncation size." in response.message
)

View File

@@ -1,69 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
outputs = llm.score(
queries,
documents,
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
return torch.tensor([x.outputs.score for x in outputs])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
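The invariant these assertions encode: use_activation=False returns the raw classifier logits, and applying the model's own activation to those logits should reproduce the default (activated) scores. A minimal, self-contained sketch of that check with made-up tensors; the helper and the default activation below are illustrative (this particular test uses the softmax utility as the activation):
import torch
def activation_relation_holds(raw: torch.Tensor,
                              activated: torch.Tensor,
                              activation=torch.sigmoid,
                              atol: float = 1e-2) -> bool:
    # Raw logits and activated scores should differ...
    changed = not torch.allclose(raw, activated, atol=atol)
    # ...but activating the raw logits should recover the activated scores.
    recovered = torch.allclose(activation(raw), activated, atol=atol)
    return changed and recovered
# Example with made-up logits:
logits = torch.tensor([1.2, -0.4])
assert activation_relation_holds(logits, torch.sigmoid(logits))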

View File

@@ -1,142 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Online API tests for ColBERT late interaction scoring."""
import pytest
import requests
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
MODEL_NAME = "answerdotai/answerai-colbert-small-v1"
COLBERT_DIM = 96
MAX_MODEL_LEN = 512
@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len",
str(MAX_MODEL_LEN),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
class TestColBERTOnline:
def test_rerank(self, server: RemoteOpenAIServer):
"""Test ColBERT rerank endpoint."""
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
paris_result = next(r for r in rerank.results if r.index == 1)
brazil_result = next(r for r in rerank.results if r.index == 0)
assert paris_result.relevance_score > brazil_result.relevance_score
def test_rerank_top_n(self, server: RemoteOpenAIServer):
"""Test ColBERT rerank with top_n parameter."""
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Machine learning is a field of AI.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
"top_n": 2,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert len(rerank.results) == 2
assert rerank.results[0].index == 1
def test_score(self, server: RemoteOpenAIServer):
"""Test ColBERT score endpoint."""
text_1 = "What is the capital of France?"
text_2 = ["The capital of France is Paris.", "Python is a language."]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"text_1": text_1,
"text_2": text_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
assert score.data[0].score > score.data[1].score
def test_token_embed(self, server: RemoteOpenAIServer):
"""Test ColBERT token_embed task via pooling endpoint."""
text = "What is the capital of France?"
pooling_response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": text,
"task": "token_embed",
},
)
pooling_response.raise_for_status()
pooling = pooling_response.json()
assert "data" in pooling
assert len(pooling["data"]) == 1
embeddings = pooling["data"][0]["data"]
assert isinstance(embeddings, list)
assert len(embeddings) > 0
assert len(embeddings[0]) == COLBERT_DIM
def test_embed_not_supported(self, server: RemoteOpenAIServer):
"""Test that ColBERT model does not support 'embed' task."""
task = "embed"
text = "What is the capital of France?"
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": text,
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(
f"Unsupported task: {task!r}"
)
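For context on the scoring scheme exercised in this file: ColBERT late interaction scores a query/document pair by matching each query token embedding against its best-matching document token embedding and summing those maxima (MaxSim). A minimal sketch over per-token embeddings such as those returned by the token_embed task above, assuming both sides are already normalized (real implementations may differ in normalization and masking):
import torch
def maxsim_score(query_tokens: torch.Tensor, doc_tokens: torch.Tensor) -> float:
    # query_tokens: (num_query_tokens, dim); doc_tokens: (num_doc_tokens, dim)
    sim = query_tokens @ doc_tokens.T              # all pairwise token similarities
    return sim.max(dim=1).values.sum().item()      # best doc token per query token, summed
# Example with random embeddings of the ColBERT dimension used above:
score = maxsim_score(torch.randn(6, 96), torch.randn(9, 96))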

View File

@@ -1,240 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse
from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
input_text = "This product was excellent and exceeded my expectations"
input_tokens = [0, 3293, 12996, 509, 40881, 136, 204839, 297, 759, 202702, 2]
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_basic(server: RemoteOpenAIServer, model_name: str):
# test /v1/models
response = requests.get(server.url_for("/v1/models"))
served_model = response.json()["data"][0]["id"]
assert served_model == MODEL_NAME
# test /tokenize
response = requests.post(
server.url_for("/tokenize"),
json={"model": model_name, "prompt": input_text},
)
assert response.json()["tokens"] == input_tokens
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_top_n(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Cross-encoder models are neat",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": model_name, "query": query, "documents": documents, "top_n": 2},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?" * 100
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": model_name, "query": query, "documents": documents},
)
assert rerank_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input." in rerank_response.text
def test_invocations(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
request_args = {
"model": MODEL_NAME,
"query": query,
"documents": documents,
}
rerank_response = requests.post(server.url_for("rerank"), json=request_args)
rerank_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
rerank_output = rerank_response.json()
invocation_output = invocation_response.json()
assert rerank_output.keys() == invocation_output.keys()
for rerank_result, invocations_result in zip(
rerank_output["results"], invocation_output["results"]
):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.05
)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
async def get_outputs(use_activation):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": query,
"documents": documents,
"use_activation": use_activation,
},
)
outputs = response.json()
return torch.tensor([x["relevance_score"] for x in outputs["results"]])
default = await get_outputs(use_activation=None)
w_activation = await get_outputs(use_activation=True)
wo_activation = await get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": "classify",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"task": "token_classify",
"input": input_text,
"encoding_format": "float",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == len(input_tokens)
assert len(poolings.data[0].data[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
server: RemoteOpenAIServer, model_name: str, task: str
):
response = requests.post(
server.url_for("pooling"),
json={
"model": model_name,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}")
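As test_use_activation above implies, a client that asks for raw scores (use_activation=False) can recover the activated relevance scores itself by applying the model's activation, a sigmoid for this cross-encoder. A small client-side sketch; the endpoint and field names follow the requests in the tests above, and the explicit sigmoid is specific to this kind of model:
import requests
import torch
def rerank_raw_then_activate(base_url: str, model: str, query: str, documents: list[str]):
    response = requests.post(
        f"{base_url}/rerank",
        json={
            "model": model,
            "query": query,
            "documents": documents,
            "use_activation": False,  # ask the server for raw logits
        },
    )
    response.raise_for_status()
    raw = torch.tensor([r["relevance_score"] for r in response.json()["results"]])
    return torch.sigmoid(raw)         # activated scores, computed client-side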

View File

@@ -1,342 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import requests
import torch
import torch.nn.functional as F
from torch import tensor
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.platforms import current_platform
MODELS = [
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
]
DTYPE = "half"
def run_transformers(hf_model, model, text_pairs):
if model["is_cross_encoder"]:
return hf_model.predict(text_pairs).tolist()
else:
hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs]
return [
F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
for pair in hf_embeddings
]
@pytest.fixture(scope="class", params=MODELS)
def model(request):
yield request.param
@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server
@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
kwargs = {
"dtype": DTYPE,
"is_cross_encoder"
if model["is_cross_encoder"]
else "is_sentence_transformer": True,
}
with hf_runner(model["name"], **kwargs) as hf_model:
yield hf_model
class TestModel:
def test_queries_str_documents_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries, documents]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_queries_str_items_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
items = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"items": items,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries, items]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_text_1_str_text_2_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1, text_2]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_data_1_str_data_2_str(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
data_1 = "What is the capital of France?"
data_2 = "The capital of France is Paris."
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"data_1": data_1,
"data_2": data_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[data_1, data_2]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_queries_str_documents_list(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries, documents[0]], [queries, documents[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_queries_list_documents_list(
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
queries = [
"What is the capital of the United States?",
"What is the capital of France?",
]
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
text_pairs = [[queries[0], documents[0]], [queries[1], documents[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
def test_score_max_model_len(
self, server: RemoteOpenAIServer, model: dict[str, Any]
):
queries = "What is the capital of France?" * 20
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
},
)
assert score_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input." in score_response.text
# Test truncation
score_response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
"truncate_prompt_tokens": 101,
},
)
assert score_response.status_code == 400
assert "Please request a smaller truncation size." in score_response.text
def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
request_args = {
"model": model["name"],
"queries": queries,
"documents": documents,
}
score_response = requests.post(server.url_for("score"), json=request_args)
score_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
score_output = score_response.json()
invocation_output = invocation_response.json()
assert score_output.keys() == invocation_output.keys()
for score_data, invocation_data in zip(
score_output["data"], invocation_output["data"]
):
assert score_data.keys() == invocation_data.keys()
assert score_data["score"] == pytest.approx(
invocation_data["score"], rel=0.05
)
# TODO: reset this tolerance to 0.01 once we find
# an alternative to flash_attn with bfloat16
def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
def get_outputs(use_activation):
queries = "What is the capital of France?"
documents = "The capital of France is Paris."
response = requests.post(
server.url_for("score"),
json={
"model": model["name"],
"queries": queries,
"documents": documents,
"use_activation": use_activation,
},
)
outputs = response.json()
return torch.tensor([x["score"] for x in outputs["data"]])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
if model["is_cross_encoder"]:
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPT = "The chef prepared a delicious meal."
EMBEDDING_SIZE = 384
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
DTYPE = "half"
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="module")
def hf_model():
return EncoderScoringHfRunner(MODEL_NAME)
@pytest.mark.skip_global_cleanup
def test_1_to_1(llm, hf_model):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
hf_outputs = hf_model.predict([text_pair]).tolist()
vllm_outputs = [
output.outputs.score for output in llm.score(text_pair[0], text_pair[1])
]
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_1_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
hf_outputs = hf_model.predict(text_pairs).tolist()
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)]
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_n_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
hf_outputs = hf_model.predict(text_pairs).tolist()
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)]
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_embed(llm):
outputs = llm.encode(PROMPT, pooling_task="embed", use_tqdm=False)
assert len(outputs) == 1
assert len(outputs[0].outputs.data) == EMBEDDING_SIZE
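Since multilingual-e5-small is an embedding model rather than a cross-encoder, the scores checked against EncoderScoringHfRunner here are, in effect, similarities between the two sentence embeddings, typically cosine similarity. A minimal reference computation over a pair of embedding vectors (sketch only; the actual reference lives in the hf_model fixture):
import torch
import torch.nn.functional as F
def cosine_score(emb_1: torch.Tensor, emb_2: torch.Tensor) -> float:
    # Normalize both embeddings, then take their dot product.
    emb_1 = F.normalize(emb_1, dim=-1)
    emb_2 = F.normalize(emb_2, dim=-1)
    return float(emb_1 @ emb_2)
# Example with random vectors of the embedding size asserted above:
score = cosine_score(torch.randn(384), torch.randn(384))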

View File

@@ -0,0 +1,414 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-base-en-v1.5"
input_text = "This product was excellent and exceeded my expectations"
DTYPE = "half"
EMBEDDING_SIZE = 768
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def hf_model():
return EncoderScoringHfRunner(MODEL_NAME)
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_1(
hf_model, server: RemoteOpenAIServer
):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2[0],
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_str_n_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_vs_documents(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_vs_items(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"items": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_text_1_vs_text_2(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"text_1": TEXTS_1,
"text_2": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_data_1_vs_data_2(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"data_1": TEXTS_1,
"data_2": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_rerank_api_texts(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
paris_result = next(r for r in rerank.results if r.index == 1)
brazil_result = next(r for r in rerank.results if r.index == 0)
assert paris_result.relevance_score > brazil_result.relevance_score
@pytest.mark.asyncio
async def test_rerank_api_top_n(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Cross-encoder models are neat",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": MODEL_NAME, "query": query, "documents": documents, "top_n": 2},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].index == 1
@pytest.mark.asyncio
async def test_rerank_api_max_model_len(server: RemoteOpenAIServer):
query = "What is the capital of France?" * 100
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": MODEL_NAME, "query": query, "documents": documents},
)
assert rerank_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input prompt" in rerank_response.text
@pytest.mark.asyncio
async def test_score_api_max_model_len(server: RemoteOpenAIServer):
queries = "What is the capital of France?" * 20
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": queries,
"documents": documents,
},
)
assert score_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input prompt" in score_response.text
# Test truncation
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": queries,
"documents": documents,
"truncate_prompt_tokens": 101,
},
)
assert score_response.status_code == 400
assert "Please request a smaller truncation size." in score_response.text
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
request_args = {
"model": MODEL_NAME,
"query": query,
"documents": documents,
}
rerank_response = requests.post(server.url_for("rerank"), json=request_args)
rerank_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
rerank_output = rerank_response.json()
invocation_output = invocation_response.json()
assert rerank_output.keys() == invocation_output.keys()
for rerank_result, invocations_result in zip(
rerank_output["results"], invocation_output["results"]
):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01
)
@pytest.mark.asyncio
async def test_pooling_embed(server: RemoteOpenAIServer):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"task": "embed",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == EMBEDDING_SIZE
@pytest.mark.asyncio
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(server: RemoteOpenAIServer, task: str):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}")
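The max-length tests above exercise two distinct 400 responses: a prompt that exceeds the context window, and a truncate_prompt_tokens value larger than max_model_len. A client-side sketch of avoiding the second error by clamping the requested truncation to the served context length; the endpoint and fields mirror the requests above, while the clamping policy itself is illustrative:
import requests
def score_with_clamped_truncation(base_url: str, model: str, queries, documents,
                                  requested_truncation: int, max_model_len: int):
    # Never request truncation to more tokens than the model can hold.
    truncation = min(requested_truncation, max_model_len)
    return requests.post(
        f"{base_url}/score",
        json={
            "model": model,
            "queries": queries,
            "documents": documents,
            "truncate_prompt_tokens": truncation,
        },
    )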

View File

@@ -0,0 +1,137 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
PROMPT = "The chef prepared a delicious meal."
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="module")
def hf_model(hf_runner):
return hf_runner(MODEL_NAME, is_cross_encoder=True)
@pytest.mark.skip_global_cleanup
def test_1_to_1(llm, hf_model):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
hf_outputs = hf_model.predict([text_pair]).tolist()
vllm_outputs = [
output.outputs.score for output in llm.score(text_pair[0], text_pair[1])
]
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_1_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)]
hf_outputs = hf_model.predict(text_pairs).tolist()
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_n_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)]
hf_outputs = hf_model.predict(text_pairs).tolist()
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_classify(llm):
outputs = llm.encode(PROMPT, pooling_task="classify", use_tqdm=False)
assert len(outputs) == 1
assert len(outputs[0].outputs.data) == 1
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
outputs = llm.score(
TEXTS_1[0],
TEXTS_2[0],
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
return torch.tensor([x.outputs.score for x in outputs])
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)

View File

@@ -0,0 +1,487 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
from vllm.platforms import current_platform
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "half"
input_text = "This product was excellent and exceeded my expectations"
input_tokens = [0, 3293, 12996, 509, 40881, 136, 204839, 297, 759, 202702, 2]
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def hf_model(hf_runner):
return hf_runner(MODEL_NAME, is_cross_encoder=True)
@pytest.mark.asyncio
async def test_basic(server: RemoteOpenAIServer):
# test /v1/models
response = requests.get(server.url_for("/v1/models"))
served_model = response.json()["data"][0]["id"]
assert served_model == MODEL_NAME
# test /tokenize
response = requests.post(
server.url_for("/tokenize"),
json={"model": MODEL_NAME, "prompt": input_text},
)
assert response.json()["tokens"] == input_tokens
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_1(
hf_model, server: RemoteOpenAIServer
):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2[0],
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_str_n_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_vs_documents(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_queries_vs_items(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"items": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_text_1_vs_text_2(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"text_1": TEXTS_1,
"text_2": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_score_api_data_1_vs_data_2(hf_model, server: RemoteOpenAIServer):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"data_1": TEXTS_1,
"data_2": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
@pytest.mark.asyncio
async def test_rerank_api_texts(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.asyncio
async def test_rerank_api_top_n(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Cross-encoder models are neat",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": MODEL_NAME, "query": query, "documents": documents, "top_n": 2},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
assert rerank.results[0].relevance_score >= 0.9
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.asyncio
async def test_rerank_api_max_model_len(server: RemoteOpenAIServer):
query = "What is the capital of France?" * 100
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={"model": MODEL_NAME, "query": query, "documents": documents},
)
assert rerank_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input prompt" in rerank_response.text
@pytest.mark.asyncio
async def test_score_api_max_model_len(server: RemoteOpenAIServer):
queries = "What is the capital of France?" * 20
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": queries,
"documents": documents,
},
)
assert score_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input prompt" in score_response.text
# Test truncation
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": queries,
"documents": documents,
"truncate_prompt_tokens": 101,
},
)
assert score_response.status_code == 400
assert "Please request a smaller truncation size." in score_response.text
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
request_args = {
"model": MODEL_NAME,
"query": query,
"documents": documents,
}
rerank_response = requests.post(server.url_for("rerank"), json=request_args)
rerank_response.raise_for_status()
invocation_response = requests.post(
server.url_for("invocations"), json=request_args
)
invocation_response.raise_for_status()
rerank_output = rerank_response.json()
invocation_output = invocation_response.json()
assert rerank_output.keys() == invocation_output.keys()
for rerank_result, invocations_result in zip(
rerank_output["results"], invocation_output["results"]
):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01
)
@pytest.mark.asyncio
async def test_use_activation(server: RemoteOpenAIServer):
async def get_outputs(use_activation):
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
"use_activation": use_activation,
},
)
outputs = response.json()
return torch.tensor([x["relevance_score"] for x in outputs["results"]])
default = await get_outputs(use_activation=None)
w_activation = await get_outputs(use_activation=True)
wo_activation = await get_outputs(use_activation=False)
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
"wo_activation should not use activation."
)
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
@pytest.mark.asyncio
async def test_pooling_classify(server: RemoteOpenAIServer):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"task": "classify",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == 1
@pytest.mark.asyncio
async def test_pooling_token_classify(server: RemoteOpenAIServer):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"task": "token_classify",
"input": input_text,
"encoding_format": "float",
},
)
poolings = PoolingResponse.model_validate(response.json())
assert len(poolings.data) == 1
assert len(poolings.data[0].data) == len(input_tokens)
assert len(poolings.data[0].data[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(server: RemoteOpenAIServer, task: str):
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}")

View File

@@ -123,7 +123,10 @@ def server(request):
yield remote_server, backend
def test_score_api_queries_str_documents_str(server: tuple[RemoteOpenAIServer, str]):
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_str(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
score_response = requests.post(
remote_server.url_for("score"),
@@ -143,7 +146,8 @@ def test_score_api_queries_str_documents_str(server: tuple[RemoteOpenAIServer, s
assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "text_vs_text")
def test_score_api_queries_str_documents_text_content(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_text_content(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -165,7 +169,8 @@ def test_score_api_queries_str_documents_text_content(
assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "text_vs_text")
def test_score_api_queries_str_documents_image_url_content(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_image_url_content(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -187,7 +192,8 @@ def test_score_api_queries_str_documents_image_url_content(
assert_score(score.data[0].score, TEXT_VS_IMAGE, backend, "text_vs_image")
def test_score_api_queries_str_documents_image_base64_content(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_image_base64_content(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -209,7 +215,8 @@ def test_score_api_queries_str_documents_image_base64_content(
assert_score(score.data[0].score, TEXT_VS_IMAGE, backend, "text_vs_image_base64")
def test_score_api_queries_str_documents_image_url_plus_text_content(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_image_url_plus_text_content(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -233,7 +240,8 @@ def test_score_api_queries_str_documents_image_url_plus_text_content(
)
def test_score_api_queries_str_documents_list(
@pytest.mark.asyncio
async def test_score_api_queries_str_documents_list(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -268,7 +276,8 @@ def test_score_api_queries_str_documents_list(
)
def test_rerank_api_queries_str_documents_list(
@pytest.mark.asyncio
async def test_rerank_api_queries_str_documents_list(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server
@@ -320,7 +329,8 @@ def test_rerank_api_queries_str_documents_list(
)
def test_score_api_queries_list_documents_list(
@pytest.mark.asyncio
async def test_score_api_queries_list_documents_list(
server: tuple[RemoteOpenAIServer, str],
):
remote_server, backend = server

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
from .util import ColBERTScoringHfRunner
MODEL_NAME = "answerdotai/answerai-colbert-small-v1"
COLBERT_DIM = 96
LINEAR_WEIGHTS_KEY = "linear.weight"
PROMPT = "The chef prepared a delicious meal."
TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
]
TEXTS_2 = [
"The capital of France is Paris.",
"The capital of Germany is Berlin.",
]
DTYPE = "half"
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=32768,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="module")
def hf_model():
return ColBERTScoringHfRunner(
model_name=MODEL_NAME, linear_weights_key=LINEAR_WEIGHTS_KEY
)
@pytest.mark.skip_global_cleanup
def test_1_to_1(llm, hf_model):
text_pair = [TEXTS_1[0], TEXTS_2[0]]
hf_outputs = hf_model.predict([text_pair]).tolist()
vllm_outputs = [
output.outputs.score for output in llm.score(text_pair[0], text_pair[1])
]
assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_1_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
hf_outputs = hf_model.predict(text_pairs).tolist()
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)]
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.mark.skip_global_cleanup
def test_n_to_n(llm, hf_model):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
hf_outputs = hf_model.predict(text_pairs).tolist()
vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)]
assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_token_embed(llm):
outputs = llm.encode(PROMPT, pooling_task="token_embed", use_tqdm=False)
assert len(outputs) == 1
assert outputs[0].outputs.data.shape == (9, COLBERT_DIM)

View File

@@ -0,0 +1,232 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Online API tests for ColBERT late interaction scoring."""

import pytest
import requests

from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse

from .util import ColBERTScoringHfRunner

MODEL_NAME = "answerdotai/answerai-colbert-small-v1"
COLBERT_DIM = 96
MAX_MODEL_LEN = 512
LINEAR_WEIGHTS_KEY = "linear.weight"

TEXTS_1 = [
    "What is the capital of France?",
    "What is the capital of Germany?",
]
TEXTS_2 = [
    "The capital of France is Paris.",
    "The capital of Germany is Berlin.",
]


@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len",
str(MAX_MODEL_LEN),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def hf_model():
return ColBERTScoringHfRunner(
model_name=MODEL_NAME, linear_weights_key=LINEAR_WEIGHTS_KEY
)
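

# The /score requests below mirror the offline shapes: a single query/document
# pair, one query string broadcast over a list of documents, and element-wise
# pairs from two equal-length lists. Scores are compared against the HF MaxSim
# reference runner.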
@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_1(
hf_model, server: RemoteOpenAIServer
):
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2[0],
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)


@pytest.mark.asyncio
async def test_score_api_queries_str_1_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1[0],
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)


@pytest.mark.asyncio
async def test_score_api_queries_str_n_documents_str_n(
hf_model, server: RemoteOpenAIServer
):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]
score_response = requests.post(
server.url_for("score"),
json={
"model": MODEL_NAME,
"queries": TEXTS_1,
"documents": TEXTS_2,
},
)
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
hf_outputs = hf_model.predict(text_pairs).tolist()
for i in range(len(vllm_outputs)):
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)


@pytest.mark.asyncio
async def test_rerank_api_texts(server: RemoteOpenAIServer):
"""Test ColBERT rerank endpoint."""
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
assert rerank.id is not None
assert rerank.results is not None
assert len(rerank.results) == 2
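    # Results keep the index of the original document, so index 1 is the Paris
    # answer; it should outscore the Brasilia distractor for this query.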
paris_result = next(r for r in rerank.results if r.index == 1)
brazil_result = next(r for r in rerank.results if r.index == 0)
assert paris_result.relevance_score > brazil_result.relevance_score


@pytest.mark.asyncio
async def test_rerank_api_top_n(server: RemoteOpenAIServer):
"""Test ColBERT rerank with top_n parameter."""
query = "What is the capital of France?"
documents = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
"Machine learning is a field of AI.",
]
rerank_response = requests.post(
server.url_for("rerank"),
json={
"model": MODEL_NAME,
"query": query,
"documents": documents,
"top_n": 2,
},
)
rerank_response.raise_for_status()
rerank = RerankResponse.model_validate(rerank_response.json())
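    # With top_n=2 only the two best-scoring documents are returned, ordered
    # by relevance, so the Paris document (original index 1) should rank first.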
assert len(rerank.results) == 2
assert rerank.results[0].index == 1


@pytest.mark.asyncio
async def test_token_embed(server: RemoteOpenAIServer):
"""Test ColBERT token_embed task via pooling endpoint."""
text = "What is the capital of France?"
pooling_response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": text,
"task": "token_embed",
},
)
pooling_response.raise_for_status()
pooling = pooling_response.json()
assert "data" in pooling
assert len(pooling["data"]) == 1
embeddings = pooling["data"][0]["data"]
assert isinstance(embeddings, list)
assert len(embeddings) > 0
assert len(embeddings[0]) == COLBERT_DIM


@pytest.mark.asyncio
async def test_embed_not_supported(server: RemoteOpenAIServer):
"""Test that ColBERT model does not support 'embed' task."""
task = "embed"
text = "What is the capital of France?"
response = requests.post(
server.url_for("pooling"),
json={
"model": MODEL_NAME,
"input": text,
"task": task,
},
)
assert response.json()["error"]["type"] == "BadRequestError"
assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}")

View File

@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from transformers import AutoModel, AutoTokenizer

from tests.conftest import HfRunner
from vllm.entrypoints.pooling.score.utils import compute_maxsim_score


class ColBERTScoringHfRunner(torch.nn.Module):
def __init__(self, model_name, linear_weights_key):
super().__init__()
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
extra = {}
if self.device.type == "cpu":
extra["attn_implementation"] = "eager"
self.model = AutoModel.from_pretrained(
model_name,
**extra,
).to(self.device)
self.model.eval()
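        # The ColBERT projection head is stored in the checkpoint's
        # safetensors (it is not part of the AutoModel graph), so load it
        # directly and keep it in float32 for the reference computation.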
path = hf_hub_download(model_name, filename="model.safetensors")
weights = load_file(path)
self.linear_weight = weights[linear_weights_key].to(self.device).float()

    @torch.inference_mode()
def forward(self, texts):
embeddings = []
for text in texts:
inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
hidden = self.model(**inputs).last_hidden_state.float()
projected = F.linear(hidden, self.linear_weight.float())
normalised = F.normalize(projected, p=2, dim=-1)
embeddings.append(normalised.squeeze(0).cpu())
return embeddings

    @torch.inference_mode()
def predict(self, prompts: list[list[str]], *args, **kwargs):
hf_embeddings = [self(prompt) for prompt in prompts]
hf_outputs = [
compute_maxsim_score(*map(torch.tensor, pair)).item()
for pair in hf_embeddings
]
return torch.as_tensor(hf_outputs)
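

# Illustrative sketch only: the MaxSim late-interaction score that
# compute_maxsim_score is expected to compute. For each query token embedding,
# take the maximum similarity over all document token embeddings, then sum
# across query tokens; with L2-normalised embeddings (as produced in forward()
# above) the similarity reduces to a dot product. The tests use the vLLM
# helper, not this sketch.
def _maxsim_reference(
    query_embeddings: torch.Tensor,  # (num_query_tokens, dim)
    doc_embeddings: torch.Tensor,  # (num_doc_tokens, dim)
) -> torch.Tensor:
    similarity = query_embeddings @ doc_embeddings.T
    return similarity.max(dim=-1).values.sum()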


class EncoderScoringHfRunner(HfRunner):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs, is_sentence_transformer=True)
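
    # Reference for embedding-based scorers: encode each text with the
    # sentence-transformers pipeline and score a pair by the cosine
    # similarity of its two sentence embeddings.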
@torch.inference_mode()
def predict(self, prompts: list[list[str]], *args, **kwargs):
hf_embeddings = [self.encode(prompt) for prompt in prompts]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]
return torch.as_tensor(hf_outputs)