diff --git a/tests/entrypoints/pooling/basic/test_truncation.py b/tests/entrypoints/pooling/basic/test_truncation.py index fcaead0e2..54cf08349 100644 --- a/tests/entrypoints/pooling/basic/test_truncation.py +++ b/tests/entrypoints/pooling/basic/test_truncation.py @@ -85,7 +85,7 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI): expected_message = ( "truncate_prompt_tokens value is " "greater than max_model_len." - " Please, select a smaller truncation size." + " Please request a smaller truncation size." ) assert error_details["message"] == expected_message diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py index 56ab09bc7..dc61244c9 100644 --- a/tests/entrypoints/pooling/embed/test_online.py +++ b/tests/entrypoints/pooling/embed/test_online.py @@ -288,7 +288,7 @@ async def test_truncate_prompt_tokens(client: openai.AsyncOpenAI, model_name: st assert "error" in response.object assert ( "truncate_prompt_tokens value is greater than max_model_len. " - "Please, select a smaller truncation size." in response.message + "Please request a smaller truncation size." in response.message ) diff --git a/tests/entrypoints/pooling/score/test_offline.py b/tests/entrypoints/pooling/score/test_offline.py deleted file mode 100644 index 4964d94e6..000000000 --- a/tests/entrypoints/pooling/score/test_offline.py +++ /dev/null @@ -1,69 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import weakref - -import pytest -import torch - -from tests.models.utils import softmax -from vllm import LLM, PoolingParams -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.platforms import current_platform - -MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" - - -@pytest.fixture(scope="module") -def llm(): - # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend - # that supports encoder-only models on ROCm. - attention_config = None - if current_platform.is_rocm(): - attention_config = {"backend": "FLEX_ATTENTION"} - - # pytest caches the fixture so we use weakref.proxy to - # enable garbage collection - llm = LLM( - model=MODEL_NAME, - max_num_batched_tokens=32768, - tensor_parallel_size=1, - gpu_memory_utilization=0.75, - enforce_eager=True, - seed=0, - attention_config=attention_config, - ) - - yield weakref.proxy(llm) - - del llm - - cleanup_dist_env_and_memory() - - -def test_pooling_params(llm: LLM): - def get_outputs(use_activation): - queries = "What is the capital of France?" - documents = "The capital of France is Paris." - - outputs = llm.score( - queries, - documents, - pooling_params=PoolingParams(use_activation=use_activation), - use_tqdm=False, - ) - return torch.tensor([x.outputs.score for x in outputs]) - - default = get_outputs(use_activation=None) - w_activation = get_outputs(use_activation=True) - wo_activation = get_outputs(use_activation=False) - - assert torch.allclose(default, w_activation, atol=1e-2), ( - "Default should use activation." - ) - assert not torch.allclose(w_activation, wo_activation, atol=1e-2), ( - "wo_activation should not use activation." - ) - assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), ( - "w_activation should be close to activation(wo_activation)." - ) diff --git a/tests/entrypoints/pooling/score/test_online_colbert.py b/tests/entrypoints/pooling/score/test_online_colbert.py deleted file mode 100644 index ac79ff0b9..000000000 --- a/tests/entrypoints/pooling/score/test_online_colbert.py +++ /dev/null @@ -1,142 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Online API tests for ColBERT late interaction scoring.""" - -import pytest -import requests - -from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse - -MODEL_NAME = "answerdotai/answerai-colbert-small-v1" -COLBERT_DIM = 96 -MAX_MODEL_LEN = 512 - - -@pytest.fixture(scope="module") -def server(): - args = [ - "--max-model-len", - str(MAX_MODEL_LEN), - ] - - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -class TestColBERTOnline: - def test_rerank(self, server: RemoteOpenAIServer): - """Test ColBERT rerank endpoint.""" - query = "What is the capital of France?" - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - rerank_response = requests.post( - server.url_for("rerank"), - json={ - "model": MODEL_NAME, - "query": query, - "documents": documents, - }, - ) - rerank_response.raise_for_status() - rerank = RerankResponse.model_validate(rerank_response.json()) - - assert rerank.id is not None - assert rerank.results is not None - assert len(rerank.results) == 2 - - paris_result = next(r for r in rerank.results if r.index == 1) - brazil_result = next(r for r in rerank.results if r.index == 0) - - assert paris_result.relevance_score > brazil_result.relevance_score - - def test_rerank_top_n(self, server: RemoteOpenAIServer): - """Test ColBERT rerank with top_n parameter.""" - query = "What is the capital of France?" - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - "Machine learning is a field of AI.", - ] - - rerank_response = requests.post( - server.url_for("rerank"), - json={ - "model": MODEL_NAME, - "query": query, - "documents": documents, - "top_n": 2, - }, - ) - rerank_response.raise_for_status() - rerank = RerankResponse.model_validate(rerank_response.json()) - - assert len(rerank.results) == 2 - assert rerank.results[0].index == 1 - - def test_score(self, server: RemoteOpenAIServer): - """Test ColBERT score endpoint.""" - text_1 = "What is the capital of France?" - text_2 = ["The capital of France is Paris.", "Python is a language."] - - score_response = requests.post( - server.url_for("score"), - json={ - "model": MODEL_NAME, - "text_1": text_1, - "text_2": text_2, - }, - ) - score_response.raise_for_status() - score = ScoreResponse.model_validate(score_response.json()) - - assert score.id is not None - assert score.data is not None - assert len(score.data) == 2 - - assert score.data[0].score > score.data[1].score - - def test_token_embed(self, server: RemoteOpenAIServer): - """Test ColBERT token_embed task via pooling endpoint.""" - text = "What is the capital of France?" - - pooling_response = requests.post( - server.url_for("pooling"), - json={ - "model": MODEL_NAME, - "input": text, - "task": "token_embed", - }, - ) - pooling_response.raise_for_status() - pooling = pooling_response.json() - - assert "data" in pooling - assert len(pooling["data"]) == 1 - - embeddings = pooling["data"][0]["data"] - assert isinstance(embeddings, list) - assert len(embeddings) > 0 - assert len(embeddings[0]) == COLBERT_DIM - - def test_embed_not_supported(self, server: RemoteOpenAIServer): - """Test that ColBERT model does not support 'embed' task.""" - task = "embed" - text = "What is the capital of France?" - - response = requests.post( - server.url_for("pooling"), - json={ - "model": MODEL_NAME, - "input": text, - "task": task, - }, - ) - - assert response.json()["error"]["type"] == "BadRequestError" - assert response.json()["error"]["message"].startswith( - f"Unsupported task: {task!r}" - ) diff --git a/tests/entrypoints/pooling/score/test_online_rerank.py b/tests/entrypoints/pooling/score/test_online_rerank.py deleted file mode 100644 index a59d2cfa9..000000000 --- a/tests/entrypoints/pooling/score/test_online_rerank.py +++ /dev/null @@ -1,240 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import requests -import torch -import torch.nn.functional as F - -from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse -from vllm.entrypoints.pooling.score.protocol import RerankResponse -from vllm.platforms import current_platform - -MODEL_NAME = "BAAI/bge-reranker-base" -DTYPE = "bfloat16" -input_text = "This product was excellent and exceeded my expectations" -input_tokens = [0, 3293, 12996, 509, 40881, 136, 204839, 297, 759, 202702, 2] - - -@pytest.fixture(scope="module") -def server(): - args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] - - # ROCm: Use Flex Attention to support encoder-only self-attention. - if current_platform.is_rocm(): - args.extend(["--attention-backend", "FLEX_ATTENTION"]) - - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_basic(server: RemoteOpenAIServer, model_name: str): - # test /v1/models - response = requests.get(server.url_for("/v1/models")) - served_model = response.json()["data"][0]["id"] - assert served_model == MODEL_NAME - - # test /tokenize - response = requests.post( - server.url_for("/tokenize"), - json={"model": model_name, "prompt": input_text}, - ) - assert response.json()["tokens"] == input_tokens - - -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_rerank_texts(server: RemoteOpenAIServer, model_name: str): - query = "What is the capital of France?" - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - rerank_response = requests.post( - server.url_for("rerank"), - json={ - "model": model_name, - "query": query, - "documents": documents, - }, - ) - rerank_response.raise_for_status() - rerank = RerankResponse.model_validate(rerank_response.json()) - - assert rerank.id is not None - assert rerank.results is not None - assert len(rerank.results) == 2 - assert rerank.results[0].relevance_score >= 0.9 - assert rerank.results[1].relevance_score <= 0.01 - - -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_top_n(server: RemoteOpenAIServer, model_name: str): - query = "What is the capital of France?" - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - "Cross-encoder models are neat", - ] - - rerank_response = requests.post( - server.url_for("rerank"), - json={"model": model_name, "query": query, "documents": documents, "top_n": 2}, - ) - rerank_response.raise_for_status() - rerank = RerankResponse.model_validate(rerank_response.json()) - - assert rerank.id is not None - assert rerank.results is not None - assert len(rerank.results) == 2 - assert rerank.results[0].relevance_score >= 0.9 - assert rerank.results[1].relevance_score <= 0.01 - - -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str): - query = "What is the capital of France?" * 100 - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - rerank_response = requests.post( - server.url_for("rerank"), - json={"model": model_name, "query": query, "documents": documents}, - ) - assert rerank_response.status_code == 400 - # Assert just a small fragments of the response - assert "Please reduce the length of the input." in rerank_response.text - - -def test_invocations(server: RemoteOpenAIServer): - query = "What is the capital of France?" - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - request_args = { - "model": MODEL_NAME, - "query": query, - "documents": documents, - } - - rerank_response = requests.post(server.url_for("rerank"), json=request_args) - rerank_response.raise_for_status() - - invocation_response = requests.post( - server.url_for("invocations"), json=request_args - ) - invocation_response.raise_for_status() - - rerank_output = rerank_response.json() - invocation_output = invocation_response.json() - - assert rerank_output.keys() == invocation_output.keys() - for rerank_result, invocations_result in zip( - rerank_output["results"], invocation_output["results"] - ): - assert rerank_result.keys() == invocations_result.keys() - assert rerank_result["relevance_score"] == pytest.approx( - invocations_result["relevance_score"], rel=0.05 - ) - # TODO: reset this tolerance to 0.01 once we find - # an alternative to flash_attn with bfloat16 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_use_activation(server: RemoteOpenAIServer, model_name: str): - async def get_outputs(use_activation): - query = "What is the capital of France?" - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - response = requests.post( - server.url_for("rerank"), - json={ - "model": model_name, - "query": query, - "documents": documents, - "use_activation": use_activation, - }, - ) - outputs = response.json() - - return torch.tensor([x["relevance_score"] for x in outputs["results"]]) - - default = await get_outputs(use_activation=None) - w_activation = await get_outputs(use_activation=True) - wo_activation = await get_outputs(use_activation=False) - - assert torch.allclose(default, w_activation, atol=1e-2), ( - "Default should use activation." - ) - assert not torch.allclose(w_activation, wo_activation, atol=1e-2), ( - "wo_activation should not use activation." - ) - assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( - "w_activation should be close to activation(wo_activation)." - ) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): - response = requests.post( - server.url_for("pooling"), - json={ - "model": model_name, - "input": input_text, - "encoding_format": "float", - "task": "classify", - }, - ) - poolings = PoolingResponse.model_validate(response.json()) - assert len(poolings.data) == 1 - assert len(poolings.data[0].data) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): - response = requests.post( - server.url_for("pooling"), - json={ - "model": model_name, - "task": "token_classify", - "input": input_text, - "encoding_format": "float", - }, - ) - - poolings = PoolingResponse.model_validate(response.json()) - - assert len(poolings.data) == 1 - assert len(poolings.data[0].data) == len(input_tokens) - assert len(poolings.data[0].data[0]) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) -async def test_pooling_not_supported( - server: RemoteOpenAIServer, model_name: str, task: str -): - response = requests.post( - server.url_for("pooling"), - json={ - "model": model_name, - "input": input_text, - "encoding_format": "float", - "task": task, - }, - ) - assert response.json()["error"]["type"] == "BadRequestError" - assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}") diff --git a/tests/entrypoints/pooling/score/test_online_score.py b/tests/entrypoints/pooling/score/test_online_score.py deleted file mode 100644 index c8b334778..000000000 --- a/tests/entrypoints/pooling/score/test_online_score.py +++ /dev/null @@ -1,342 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any - -import pytest -import requests -import torch -import torch.nn.functional as F -from torch import tensor - -from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.pooling.score.protocol import ScoreResponse -from vllm.platforms import current_platform - -MODELS = [ - {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True}, - {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False}, -] -DTYPE = "half" - - -def run_transformers(hf_model, model, text_pairs): - if model["is_cross_encoder"]: - return hf_model.predict(text_pairs).tolist() - else: - hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs] - return [ - F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0) - for pair in hf_embeddings - ] - - -@pytest.fixture(scope="class", params=MODELS) -def model(request): - yield request.param - - -@pytest.fixture(scope="class") -def server(model: dict[str, Any]): - args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] - - # ROCm: Use Flex Attention to support encoder-only self-attention. - if current_platform.is_rocm(): - args.extend(["--attention-backend", "FLEX_ATTENTION"]) - - with RemoteOpenAIServer(model["name"], args) as remote_server: - yield remote_server - - -@pytest.fixture(scope="class") -def runner(model: dict[str, Any], hf_runner): - kwargs = { - "dtype": DTYPE, - "is_cross_encoder" - if model["is_cross_encoder"] - else "is_sentence_transformer": True, - } - - with hf_runner(model["name"], **kwargs) as hf_model: - yield hf_model - - -class TestModel: - def test_queries_str_documents_str( - self, server: RemoteOpenAIServer, model: dict[str, Any], runner - ): - queries = "What is the capital of France?" - documents = "The capital of France is Paris." - - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "queries": queries, - "documents": documents, - }, - ) - score_response.raise_for_status() - score = ScoreResponse.model_validate(score_response.json()) - - assert score.id is not None - assert score.data is not None - assert len(score.data) == 1 - - vllm_outputs = [d.score for d in score.data] - - text_pairs = [[queries, documents]] - hf_outputs = run_transformers(runner, model, text_pairs) - - for i in range(len(vllm_outputs)): - assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - - def test_queries_str_items_str( - self, server: RemoteOpenAIServer, model: dict[str, Any], runner - ): - queries = "What is the capital of France?" - items = "The capital of France is Paris." - - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "queries": queries, - "items": items, - }, - ) - score_response.raise_for_status() - score = ScoreResponse.model_validate(score_response.json()) - - assert score.id is not None - assert score.data is not None - assert len(score.data) == 1 - - vllm_outputs = [d.score for d in score.data] - - text_pairs = [[queries, items]] - hf_outputs = run_transformers(runner, model, text_pairs) - - for i in range(len(vllm_outputs)): - assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - - def test_text_1_str_text_2_str( - self, server: RemoteOpenAIServer, model: dict[str, Any], runner - ): - text_1 = "What is the capital of France?" - text_2 = "The capital of France is Paris." - - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "text_1": text_1, - "text_2": text_2, - }, - ) - score_response.raise_for_status() - score = ScoreResponse.model_validate(score_response.json()) - - assert score.id is not None - assert score.data is not None - assert len(score.data) == 1 - - vllm_outputs = [d.score for d in score.data] - - text_pairs = [[text_1, text_2]] - hf_outputs = run_transformers(runner, model, text_pairs) - - for i in range(len(vllm_outputs)): - assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - - def test_data_1_str_data_2_str( - self, server: RemoteOpenAIServer, model: dict[str, Any], runner - ): - data_1 = "What is the capital of France?" - data_2 = "The capital of France is Paris." - - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "data_1": data_1, - "data_2": data_2, - }, - ) - score_response.raise_for_status() - score = ScoreResponse.model_validate(score_response.json()) - - assert score.id is not None - assert score.data is not None - assert len(score.data) == 1 - - vllm_outputs = [d.score for d in score.data] - - text_pairs = [[data_1, data_2]] - hf_outputs = run_transformers(runner, model, text_pairs) - - for i in range(len(vllm_outputs)): - assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - - def test_queries_str_documents_list( - self, server: RemoteOpenAIServer, model: dict[str, Any], runner - ): - queries = "What is the capital of France?" - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "queries": queries, - "documents": documents, - }, - ) - score_response.raise_for_status() - score = ScoreResponse.model_validate(score_response.json()) - - assert score.id is not None - assert score.data is not None - assert len(score.data) == 2 - - vllm_outputs = [d.score for d in score.data] - - text_pairs = [[queries, documents[0]], [queries, documents[1]]] - hf_outputs = run_transformers(runner, model, text_pairs) - - for i in range(len(vllm_outputs)): - assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - - def test_queries_list_documents_list( - self, server: RemoteOpenAIServer, model: dict[str, Any], runner - ): - queries = [ - "What is the capital of the United States?", - "What is the capital of France?", - ] - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "queries": queries, - "documents": documents, - }, - ) - score_response.raise_for_status() - score = ScoreResponse.model_validate(score_response.json()) - - assert score.id is not None - assert score.data is not None - assert len(score.data) == 2 - - vllm_outputs = [d.score for d in score.data] - - text_pairs = [[queries[0], documents[0]], [queries[1], documents[1]]] - hf_outputs = run_transformers(runner, model, text_pairs) - - for i in range(len(vllm_outputs)): - assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) - - def test_score_max_model_len( - self, server: RemoteOpenAIServer, model: dict[str, Any] - ): - queries = "What is the capital of France?" * 20 - documents = [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", - ] - - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "queries": queries, - "documents": documents, - }, - ) - assert score_response.status_code == 400 - # Assert just a small fragments of the response - assert "Please reduce the length of the input." in score_response.text - - # Test truncation - score_response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "queries": queries, - "documents": documents, - "truncate_prompt_tokens": 101, - }, - ) - assert score_response.status_code == 400 - assert "Please request a smaller truncation size." in score_response.text - - def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]): - queries = "What is the capital of France?" - documents = "The capital of France is Paris." - - request_args = { - "model": model["name"], - "queries": queries, - "documents": documents, - } - - score_response = requests.post(server.url_for("score"), json=request_args) - score_response.raise_for_status() - - invocation_response = requests.post( - server.url_for("invocations"), json=request_args - ) - invocation_response.raise_for_status() - - score_output = score_response.json() - invocation_output = invocation_response.json() - - assert score_output.keys() == invocation_output.keys() - for score_data, invocation_data in zip( - score_output["data"], invocation_output["data"] - ): - assert score_data.keys() == invocation_data.keys() - assert score_data["score"] == pytest.approx( - invocation_data["score"], rel=0.05 - ) - # TODO: reset this tolerance to 0.01 once we find - # an alternative to flash_attn with bfloat16 - - def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): - def get_outputs(use_activation): - queries = "What is the capital of France?" - documents = "The capital of France is Paris." - response = requests.post( - server.url_for("score"), - json={ - "model": model["name"], - "queries": queries, - "documents": documents, - "use_activation": use_activation, - }, - ) - outputs = response.json() - return torch.tensor([x["score"] for x in outputs["data"]]) - - default = get_outputs(use_activation=None) - w_activation = get_outputs(use_activation=True) - wo_activation = get_outputs(use_activation=False) - - if model["is_cross_encoder"]: - assert torch.allclose(default, w_activation, atol=1e-2), ( - "Default should use activation." - ) - assert not torch.allclose(w_activation, wo_activation, atol=1e-2), ( - "wo_activation should not use activation." - ) - assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( - "w_activation should be close to activation(wo_activation)." - ) diff --git a/tests/entrypoints/pooling/score/__init__.py b/tests/entrypoints/pooling/scoring/__init__.py similarity index 100% rename from tests/entrypoints/pooling/score/__init__.py rename to tests/entrypoints/pooling/scoring/__init__.py diff --git a/tests/entrypoints/pooling/scoring/test_bi_encoder_offline.py b/tests/entrypoints/pooling/scoring/test_bi_encoder_offline.py new file mode 100644 index 000000000..aba70293f --- /dev/null +++ b/tests/entrypoints/pooling/scoring/test_bi_encoder_offline.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import weakref + +import pytest + +from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner +from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +MODEL_NAME = "intfloat/multilingual-e5-small" +PROMPT = "The chef prepared a delicious meal." +EMBEDDING_SIZE = 384 + +TEXTS_1 = [ + "What is the capital of France?", + "What is the capital of Germany?", +] + +TEXTS_2 = [ + "The capital of France is Paris.", + "The capital of Germany is Berlin.", +] + +DTYPE = "half" + + +@pytest.fixture(scope="module") +def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. + attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM( + model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True, + seed=0, + attention_config=attention_config, + ) + + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="module") +def hf_model(): + return EncoderScoringHfRunner(MODEL_NAME) + + +@pytest.mark.skip_global_cleanup +def test_1_to_1(llm, hf_model): + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + hf_outputs = hf_model.predict([text_pair]).tolist() + vllm_outputs = [ + output.outputs.score for output in llm.score(text_pair[0], text_pair[1]) + ] + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + + +@pytest.mark.skip_global_cleanup +def test_1_to_n(llm, hf_model): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + hf_outputs = hf_model.predict(text_pairs).tolist() + vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)] + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) + + +@pytest.mark.skip_global_cleanup +def test_n_to_n(llm, hf_model): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + hf_outputs = hf_model.predict(text_pairs).tolist() + vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)] + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) + + +def test_embed(llm): + outputs = llm.encode(PROMPT, pooling_task="embed", use_tqdm=False) + assert len(outputs) == 1 + assert len(outputs[0].outputs.data) == EMBEDDING_SIZE diff --git a/tests/entrypoints/pooling/scoring/test_bi_encoder_online.py b/tests/entrypoints/pooling/scoring/test_bi_encoder_online.py new file mode 100644 index 000000000..fb925836f --- /dev/null +++ b/tests/entrypoints/pooling/scoring/test_bi_encoder_online.py @@ -0,0 +1,414 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import requests + +from tests.entrypoints.pooling.scoring.util import EncoderScoringHfRunner +from tests.utils import RemoteOpenAIServer +from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse +from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse +from vllm.platforms import current_platform + +MODEL_NAME = "BAAI/bge-base-en-v1.5" +input_text = "This product was excellent and exceeded my expectations" +DTYPE = "half" +EMBEDDING_SIZE = 768 + +TEXTS_1 = [ + "What is the capital of France?", + "What is the capital of Germany?", +] + +TEXTS_2 = [ + "The capital of France is Paris.", + "The capital of Germany is Berlin.", +] + + +@pytest.fixture(scope="module") +def server(): + args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] + + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def hf_model(): + return EncoderScoringHfRunner(MODEL_NAME) + + +@pytest.mark.asyncio +async def test_score_api_queries_str_1_documents_str_1( + hf_model, server: RemoteOpenAIServer +): + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1[0], + "documents": TEXTS_2[0], + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 1 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_str_1_documents_str_n( + hf_model, server: RemoteOpenAIServer +): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1[0], + "documents": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_str_n_documents_str_n( + hf_model, server: RemoteOpenAIServer +): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1, + "documents": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_vs_documents(hf_model, server: RemoteOpenAIServer): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1, + "documents": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_vs_items(hf_model, server: RemoteOpenAIServer): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1, + "items": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_text_1_vs_text_2(hf_model, server: RemoteOpenAIServer): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "text_1": TEXTS_1, + "text_2": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_data_1_vs_data_2(hf_model, server: RemoteOpenAIServer): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "data_1": TEXTS_1, + "data_2": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_rerank_api_texts(server: RemoteOpenAIServer): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + rerank_response = requests.post( + server.url_for("rerank"), + json={ + "model": MODEL_NAME, + "query": query, + "documents": documents, + }, + ) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + paris_result = next(r for r in rerank.results if r.index == 1) + brazil_result = next(r for r in rerank.results if r.index == 0) + assert paris_result.relevance_score > brazil_result.relevance_score + + +@pytest.mark.asyncio +async def test_rerank_api_top_n(server: RemoteOpenAIServer): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + "Cross-encoder models are neat", + ] + + rerank_response = requests.post( + server.url_for("rerank"), + json={"model": MODEL_NAME, "query": query, "documents": documents, "top_n": 2}, + ) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].index == 1 + + +@pytest.mark.asyncio +async def test_rerank_api_max_model_len(server: RemoteOpenAIServer): + query = "What is the capital of France?" * 100 + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + rerank_response = requests.post( + server.url_for("rerank"), + json={"model": MODEL_NAME, "query": query, "documents": documents}, + ) + assert rerank_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input prompt" in rerank_response.text + + +@pytest.mark.asyncio +async def test_score_api_max_model_len(server: RemoteOpenAIServer): + queries = "What is the capital of France?" * 20 + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": queries, + "documents": documents, + }, + ) + assert score_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input prompt" in score_response.text + + # Test truncation + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": queries, + "documents": documents, + "truncate_prompt_tokens": 101, + }, + ) + assert score_response.status_code == 400 + assert "Please request a smaller truncation size." in score_response.text + + +@pytest.mark.asyncio +async def test_invocations(server: RemoteOpenAIServer): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + request_args = { + "model": MODEL_NAME, + "query": query, + "documents": documents, + } + + rerank_response = requests.post(server.url_for("rerank"), json=request_args) + rerank_response.raise_for_status() + + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) + invocation_response.raise_for_status() + + rerank_output = rerank_response.json() + invocation_output = invocation_response.json() + + assert rerank_output.keys() == invocation_output.keys() + for rerank_result, invocations_result in zip( + rerank_output["results"], invocation_output["results"] + ): + assert rerank_result.keys() == invocations_result.keys() + assert rerank_result["relevance_score"] == pytest.approx( + invocations_result["relevance_score"], rel=0.01 + ) + + +@pytest.mark.asyncio +async def test_pooling_embed(server: RemoteOpenAIServer): + response = requests.post( + server.url_for("pooling"), + json={ + "model": MODEL_NAME, + "input": input_text, + "encoding_format": "float", + "task": "embed", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == EMBEDDING_SIZE + + +@pytest.mark.asyncio +@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"]) +async def test_pooling_not_supported(server: RemoteOpenAIServer, task: str): + response = requests.post( + server.url_for("pooling"), + json={ + "model": MODEL_NAME, + "input": input_text, + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}") diff --git a/tests/entrypoints/pooling/score/test_correctness_mteb.py b/tests/entrypoints/pooling/scoring/test_cross_encoder_correctness_mteb.py similarity index 100% rename from tests/entrypoints/pooling/score/test_correctness_mteb.py rename to tests/entrypoints/pooling/scoring/test_cross_encoder_correctness_mteb.py diff --git a/tests/entrypoints/pooling/scoring/test_cross_encoder_offline.py b/tests/entrypoints/pooling/scoring/test_cross_encoder_offline.py new file mode 100644 index 000000000..cb76d7460 --- /dev/null +++ b/tests/entrypoints/pooling/scoring/test_cross_encoder_offline.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import weakref + +import pytest +import torch + +from tests.models.utils import softmax +from vllm import LLM, PoolingParams +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" +PROMPT = "The chef prepared a delicious meal." +TEXTS_1 = [ + "What is the capital of France?", + "What is the capital of Germany?", +] + +TEXTS_2 = [ + "The capital of France is Paris.", + "The capital of Germany is Berlin.", +] + + +@pytest.fixture(scope="module") +def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. + attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM( + model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True, + seed=0, + attention_config=attention_config, + ) + + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="module") +def hf_model(hf_runner): + return hf_runner(MODEL_NAME, is_cross_encoder=True) + + +@pytest.mark.skip_global_cleanup +def test_1_to_1(llm, hf_model): + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + hf_outputs = hf_model.predict([text_pair]).tolist() + vllm_outputs = [ + output.outputs.score for output in llm.score(text_pair[0], text_pair[1]) + ] + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + + +@pytest.mark.skip_global_cleanup +def test_1_to_n(llm, hf_model): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)] + hf_outputs = hf_model.predict(text_pairs).tolist() + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) + + +@pytest.mark.skip_global_cleanup +def test_n_to_n(llm, hf_model): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)] + hf_outputs = hf_model.predict(text_pairs).tolist() + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) + + +@pytest.mark.skip_global_cleanup +def test_classify(llm): + outputs = llm.encode(PROMPT, pooling_task="classify", use_tqdm=False) + assert len(outputs) == 1 + assert len(outputs[0].outputs.data) == 1 + + +def test_pooling_params(llm: LLM): + def get_outputs(use_activation): + outputs = llm.score( + TEXTS_1[0], + TEXTS_2[0], + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, + ) + return torch.tensor([x.outputs.score for x in outputs]) + + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) + + assert torch.allclose(default, w_activation, atol=1e-2), ( + "Default should use activation." + ) + assert not torch.allclose(w_activation, wo_activation, atol=1e-2), ( + "wo_activation should not use activation." + ) + assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), ( + "w_activation should be close to activation(wo_activation)." + ) diff --git a/tests/entrypoints/pooling/scoring/test_cross_encoder_online.py b/tests/entrypoints/pooling/scoring/test_cross_encoder_online.py new file mode 100644 index 000000000..c6747a464 --- /dev/null +++ b/tests/entrypoints/pooling/scoring/test_cross_encoder_online.py @@ -0,0 +1,487 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import requests +import torch +import torch.nn.functional as F + +from tests.utils import RemoteOpenAIServer +from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse +from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse +from vllm.platforms import current_platform + +MODEL_NAME = "BAAI/bge-reranker-base" +DTYPE = "half" +input_text = "This product was excellent and exceeded my expectations" +input_tokens = [0, 3293, 12996, 509, 40881, 136, 204839, 297, 759, 202702, 2] + + +TEXTS_1 = [ + "What is the capital of France?", + "What is the capital of Germany?", +] + +TEXTS_2 = [ + "The capital of France is Paris.", + "The capital of Germany is Berlin.", +] + + +@pytest.fixture(scope="module") +def server(): + args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] + + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def hf_model(hf_runner): + return hf_runner(MODEL_NAME, is_cross_encoder=True) + + +@pytest.mark.asyncio +async def test_basic(server: RemoteOpenAIServer): + # test /v1/models + response = requests.get(server.url_for("/v1/models")) + served_model = response.json()["data"][0]["id"] + assert served_model == MODEL_NAME + + # test /tokenize + response = requests.post( + server.url_for("/tokenize"), + json={"model": MODEL_NAME, "prompt": input_text}, + ) + assert response.json()["tokens"] == input_tokens + + +@pytest.mark.asyncio +async def test_score_api_queries_str_1_documents_str_1( + hf_model, server: RemoteOpenAIServer +): + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1[0], + "documents": TEXTS_2[0], + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 1 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_str_1_documents_str_n( + hf_model, server: RemoteOpenAIServer +): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1[0], + "documents": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_str_n_documents_str_n( + hf_model, server: RemoteOpenAIServer +): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1, + "documents": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_vs_documents(hf_model, server: RemoteOpenAIServer): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1, + "documents": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_vs_items(hf_model, server: RemoteOpenAIServer): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1, + "items": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_text_1_vs_text_2(hf_model, server: RemoteOpenAIServer): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "text_1": TEXTS_1, + "text_2": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_data_1_vs_data_2(hf_model, server: RemoteOpenAIServer): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "data_1": TEXTS_1, + "data_2": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_rerank_api_texts(server: RemoteOpenAIServer): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + rerank_response = requests.post( + server.url_for("rerank"), + json={ + "model": MODEL_NAME, + "query": query, + "documents": documents, + }, + ) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +async def test_rerank_api_top_n(server: RemoteOpenAIServer): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + "Cross-encoder models are neat", + ] + + rerank_response = requests.post( + server.url_for("rerank"), + json={"model": MODEL_NAME, "query": query, "documents": documents, "top_n": 2}, + ) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +async def test_rerank_api_max_model_len(server: RemoteOpenAIServer): + query = "What is the capital of France?" * 100 + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + rerank_response = requests.post( + server.url_for("rerank"), + json={"model": MODEL_NAME, "query": query, "documents": documents}, + ) + assert rerank_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input prompt" in rerank_response.text + + +@pytest.mark.asyncio +async def test_score_api_max_model_len(server: RemoteOpenAIServer): + queries = "What is the capital of France?" * 20 + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": queries, + "documents": documents, + }, + ) + assert score_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input prompt" in score_response.text + + # Test truncation + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": queries, + "documents": documents, + "truncate_prompt_tokens": 101, + }, + ) + assert score_response.status_code == 400 + assert "Please request a smaller truncation size." in score_response.text + + +@pytest.mark.asyncio +async def test_invocations(server: RemoteOpenAIServer): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + request_args = { + "model": MODEL_NAME, + "query": query, + "documents": documents, + } + + rerank_response = requests.post(server.url_for("rerank"), json=request_args) + rerank_response.raise_for_status() + + invocation_response = requests.post( + server.url_for("invocations"), json=request_args + ) + invocation_response.raise_for_status() + + rerank_output = rerank_response.json() + invocation_output = invocation_response.json() + + assert rerank_output.keys() == invocation_output.keys() + for rerank_result, invocations_result in zip( + rerank_output["results"], invocation_output["results"] + ): + assert rerank_result.keys() == invocations_result.keys() + assert rerank_result["relevance_score"] == pytest.approx( + invocations_result["relevance_score"], rel=0.01 + ) + + +@pytest.mark.asyncio +async def test_use_activation(server: RemoteOpenAIServer): + async def get_outputs(use_activation): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + response = requests.post( + server.url_for("rerank"), + json={ + "model": MODEL_NAME, + "query": query, + "documents": documents, + "use_activation": use_activation, + }, + ) + outputs = response.json() + + return torch.tensor([x["relevance_score"] for x in outputs["results"]]) + + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) + + assert torch.allclose(default, w_activation, atol=1e-2), ( + "Default should use activation." + ) + assert not torch.allclose(w_activation, wo_activation, atol=1e-2), ( + "wo_activation should not use activation." + ) + assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( + "w_activation should be close to activation(wo_activation)." + ) + + +@pytest.mark.asyncio +async def test_pooling_classify(server: RemoteOpenAIServer): + response = requests.post( + server.url_for("pooling"), + json={ + "model": MODEL_NAME, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 1 + + +@pytest.mark.asyncio +async def test_pooling_token_classify(server: RemoteOpenAIServer): + response = requests.post( + server.url_for("pooling"), + json={ + "model": MODEL_NAME, + "task": "token_classify", + "input": input_text, + "encoding_format": "float", + }, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == len(input_tokens) + assert len(poolings.data[0].data[0]) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported(server: RemoteOpenAIServer, task: str): + response = requests.post( + server.url_for("pooling"), + json={ + "model": MODEL_NAME, + "input": input_text, + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}") diff --git a/tests/entrypoints/pooling/score/test_online_score_vision.py b/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py similarity index 93% rename from tests/entrypoints/pooling/score/test_online_score_vision.py rename to tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py index b94335b54..cc4ba6a8e 100644 --- a/tests/entrypoints/pooling/score/test_online_score_vision.py +++ b/tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py @@ -123,7 +123,10 @@ def server(request): yield remote_server, backend -def test_score_api_queries_str_documents_str(server: tuple[RemoteOpenAIServer, str]): +@pytest.mark.asyncio +async def test_score_api_queries_str_documents_str( + server: tuple[RemoteOpenAIServer, str], +): remote_server, backend = server score_response = requests.post( remote_server.url_for("score"), @@ -143,7 +146,8 @@ def test_score_api_queries_str_documents_str(server: tuple[RemoteOpenAIServer, s assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "text_vs_text") -def test_score_api_queries_str_documents_text_content( +@pytest.mark.asyncio +async def test_score_api_queries_str_documents_text_content( server: tuple[RemoteOpenAIServer, str], ): remote_server, backend = server @@ -165,7 +169,8 @@ def test_score_api_queries_str_documents_text_content( assert_score(score.data[0].score, TEXT_VS_TEXT, backend, "text_vs_text") -def test_score_api_queries_str_documents_image_url_content( +@pytest.mark.asyncio +async def test_score_api_queries_str_documents_image_url_content( server: tuple[RemoteOpenAIServer, str], ): remote_server, backend = server @@ -187,7 +192,8 @@ def test_score_api_queries_str_documents_image_url_content( assert_score(score.data[0].score, TEXT_VS_IMAGE, backend, "text_vs_image") -def test_score_api_queries_str_documents_image_base64_content( +@pytest.mark.asyncio +async def test_score_api_queries_str_documents_image_base64_content( server: tuple[RemoteOpenAIServer, str], ): remote_server, backend = server @@ -209,7 +215,8 @@ def test_score_api_queries_str_documents_image_base64_content( assert_score(score.data[0].score, TEXT_VS_IMAGE, backend, "text_vs_image_base64") -def test_score_api_queries_str_documents_image_url_plus_text_content( +@pytest.mark.asyncio +async def test_score_api_queries_str_documents_image_url_plus_text_content( server: tuple[RemoteOpenAIServer, str], ): remote_server, backend = server @@ -233,7 +240,8 @@ def test_score_api_queries_str_documents_image_url_plus_text_content( ) -def test_score_api_queries_str_documents_list( +@pytest.mark.asyncio +async def test_score_api_queries_str_documents_list( server: tuple[RemoteOpenAIServer, str], ): remote_server, backend = server @@ -268,7 +276,8 @@ def test_score_api_queries_str_documents_list( ) -def test_rerank_api_queries_str_documents_list( +@pytest.mark.asyncio +async def test_rerank_api_queries_str_documents_list( server: tuple[RemoteOpenAIServer, str], ): remote_server, backend = server @@ -320,7 +329,8 @@ def test_rerank_api_queries_str_documents_list( ) -def test_score_api_queries_list_documents_list( +@pytest.mark.asyncio +async def test_score_api_queries_list_documents_list( server: tuple[RemoteOpenAIServer, str], ): remote_server, backend = server diff --git a/tests/entrypoints/pooling/scoring/test_late_interaction_offline.py b/tests/entrypoints/pooling/scoring/test_late_interaction_offline.py new file mode 100644 index 000000000..ea162d630 --- /dev/null +++ b/tests/entrypoints/pooling/scoring/test_late_interaction_offline.py @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import weakref + +import pytest + +from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +from .util import ColBERTScoringHfRunner + +MODEL_NAME = "answerdotai/answerai-colbert-small-v1" +COLBERT_DIM = 96 + +LINEAR_WEIGHTS_KEY = "linear.weight" +PROMPT = "The chef prepared a delicious meal." + +TEXTS_1 = [ + "What is the capital of France?", + "What is the capital of Germany?", +] + +TEXTS_2 = [ + "The capital of France is Paris.", + "The capital of Germany is Berlin.", +] + +DTYPE = "half" + + +@pytest.fixture(scope="module") +def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. + attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM( + model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True, + seed=0, + attention_config=attention_config, + ) + + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="module") +def hf_model(): + return ColBERTScoringHfRunner( + model_name=MODEL_NAME, linear_weights_key=LINEAR_WEIGHTS_KEY + ) + + +@pytest.mark.skip_global_cleanup +def test_1_to_1(llm, hf_model): + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + hf_outputs = hf_model.predict([text_pair]).tolist() + vllm_outputs = [ + output.outputs.score for output in llm.score(text_pair[0], text_pair[1]) + ] + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + + +@pytest.mark.skip_global_cleanup +def test_1_to_n(llm, hf_model): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + hf_outputs = hf_model.predict(text_pairs).tolist() + vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1[0], TEXTS_2)] + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) + + +@pytest.mark.skip_global_cleanup +def test_n_to_n(llm, hf_model): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + hf_outputs = hf_model.predict(text_pairs).tolist() + vllm_outputs = [output.outputs.score for output in llm.score(TEXTS_1, TEXTS_2)] + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) + + +def test_token_embed(llm): + outputs = llm.encode(PROMPT, pooling_task="token_embed", use_tqdm=False) + assert len(outputs) == 1 + assert outputs[0].outputs.data.shape == (9, COLBERT_DIM) diff --git a/tests/entrypoints/pooling/scoring/test_late_interaction_online.py b/tests/entrypoints/pooling/scoring/test_late_interaction_online.py new file mode 100644 index 000000000..77d1fa16c --- /dev/null +++ b/tests/entrypoints/pooling/scoring/test_late_interaction_online.py @@ -0,0 +1,232 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Online API tests for ColBERT late interaction scoring.""" + +import pytest +import requests + +from tests.utils import RemoteOpenAIServer +from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse + +from .util import ColBERTScoringHfRunner + +MODEL_NAME = "answerdotai/answerai-colbert-small-v1" +COLBERT_DIM = 96 +MAX_MODEL_LEN = 512 +LINEAR_WEIGHTS_KEY = "linear.weight" + +TEXTS_1 = [ + "What is the capital of France?", + "What is the capital of Germany?", +] + +TEXTS_2 = [ + "The capital of France is Paris.", + "The capital of Germany is Berlin.", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--max-model-len", + str(MAX_MODEL_LEN), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def hf_model(): + return ColBERTScoringHfRunner( + model_name=MODEL_NAME, linear_weights_key=LINEAR_WEIGHTS_KEY + ) + + +@pytest.mark.asyncio +async def test_score_api_queries_str_1_documents_str_1( + hf_model, server: RemoteOpenAIServer +): + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1[0], + "documents": TEXTS_2[0], + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 1 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict([[TEXTS_1[0], TEXTS_2[0]]]).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_str_1_documents_str_n( + hf_model, server: RemoteOpenAIServer +): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1[0], + "documents": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_score_api_queries_str_n_documents_str_n( + hf_model, server: RemoteOpenAIServer +): + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + score_response = requests.post( + server.url_for("score"), + json={ + "model": MODEL_NAME, + "queries": TEXTS_1, + "documents": TEXTS_2, + }, + ) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + + vllm_outputs = [d.score for d in score.data] + hf_outputs = hf_model.predict(text_pairs).tolist() + + for i in range(len(vllm_outputs)): + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) + + +@pytest.mark.asyncio +async def test_rerank_api_texts(server: RemoteOpenAIServer): + """Test ColBERT rerank endpoint.""" + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + rerank_response = requests.post( + server.url_for("rerank"), + json={ + "model": MODEL_NAME, + "query": query, + "documents": documents, + }, + ) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + + paris_result = next(r for r in rerank.results if r.index == 1) + brazil_result = next(r for r in rerank.results if r.index == 0) + + assert paris_result.relevance_score > brazil_result.relevance_score + + +@pytest.mark.asyncio +async def test_rerank_api_top_n(server: RemoteOpenAIServer): + """Test ColBERT rerank with top_n parameter.""" + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + "Machine learning is a field of AI.", + ] + + rerank_response = requests.post( + server.url_for("rerank"), + json={ + "model": MODEL_NAME, + "query": query, + "documents": documents, + "top_n": 2, + }, + ) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert len(rerank.results) == 2 + assert rerank.results[0].index == 1 + + +@pytest.mark.asyncio +async def test_token_embed(server: RemoteOpenAIServer): + """Test ColBERT token_embed task via pooling endpoint.""" + text = "What is the capital of France?" + + pooling_response = requests.post( + server.url_for("pooling"), + json={ + "model": MODEL_NAME, + "input": text, + "task": "token_embed", + }, + ) + pooling_response.raise_for_status() + pooling = pooling_response.json() + + assert "data" in pooling + assert len(pooling["data"]) == 1 + + embeddings = pooling["data"][0]["data"] + assert isinstance(embeddings, list) + assert len(embeddings) > 0 + assert len(embeddings[0]) == COLBERT_DIM + + +@pytest.mark.asyncio +async def test_embed_not_supported(server: RemoteOpenAIServer): + """Test that ColBERT model does not support 'embed' task.""" + task = "embed" + text = "What is the capital of France?" + + response = requests.post( + server.url_for("pooling"), + json={ + "model": MODEL_NAME, + "input": text, + "task": task, + }, + ) + + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith(f"Unsupported task: {task!r}") diff --git a/tests/entrypoints/pooling/score/test_utils.py b/tests/entrypoints/pooling/scoring/test_utils.py similarity index 100% rename from tests/entrypoints/pooling/score/test_utils.py rename to tests/entrypoints/pooling/scoring/test_utils.py diff --git a/tests/entrypoints/pooling/scoring/util.py b/tests/entrypoints/pooling/scoring/util.py new file mode 100644 index 000000000..6d9aa1524 --- /dev/null +++ b/tests/entrypoints/pooling/scoring/util.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import torch.nn.functional as F +from huggingface_hub import hf_hub_download +from safetensors.torch import load_file +from transformers import AutoModel, AutoTokenizer + +from tests.conftest import HfRunner +from vllm.entrypoints.pooling.score.utils import compute_maxsim_score + + +class ColBERTScoringHfRunner(torch.nn.Module): + def __init__(self, model_name, linear_weights_key): + super().__init__() + + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + extra = {} + if self.device.type == "cpu": + extra["attn_implementation"] = "eager" + + self.model = AutoModel.from_pretrained( + model_name, + **extra, + ).to(self.device) + self.model.eval() + + path = hf_hub_download(model_name, filename="model.safetensors") + weights = load_file(path) + + self.linear_weight = weights[linear_weights_key].to(self.device).float() + + @torch.inference_mode() + def forward(self, texts): + embeddings = [] + for text in texts: + inputs = self.tokenizer(text, return_tensors="pt").to(self.device) + hidden = self.model(**inputs).last_hidden_state.float() + projected = F.linear(hidden, self.linear_weight.float()) + normalised = F.normalize(projected, p=2, dim=-1) + embeddings.append(normalised.squeeze(0).cpu()) + return embeddings + + @torch.inference_mode() + def predict(self, prompts: list[list[str]], *args, **kwargs): + hf_embeddings = [self(prompt) for prompt in prompts] + hf_outputs = [ + compute_maxsim_score(*map(torch.tensor, pair)).item() + for pair in hf_embeddings + ] + return torch.as_tensor(hf_outputs) + + +class EncoderScoringHfRunner(HfRunner): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs, is_sentence_transformer=True) + + @torch.inference_mode() + def predict(self, prompts: list[list[str]], *args, **kwargs): + hf_embeddings = [self.encode(prompt) for prompt in prompts] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + return torch.as_tensor(hf_outputs) diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py deleted file mode 100644 index 416a43070..000000000 --- a/tests/models/language/pooling/test_scoring.py +++ /dev/null @@ -1,169 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest -import torch -import torch.nn.functional as F - -CROSS_ENCODER_MODELS = [ - "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert - "BAAI/bge-reranker-v2-m3", # Roberta -] - -EMBEDDING_MODELS = [ - "sentence-transformers/all-MiniLM-L12-v2", -] - -TEXTS_1 = [ - "What is the capital of France?", - "What is the capital of Germany?", -] - -TEXTS_2 = [ - "The capital of France is Paris.", - "The capital of Germany is Berlin.", -] - -DTYPE = "half" - - -@pytest.fixture(scope="module", params=CROSS_ENCODER_MODELS) -def model_name(request): - yield request.param - - -def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name): - text_pair = [TEXTS_1[0], TEXTS_2[0]] - - with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: - hf_outputs = hf_model.predict([text_pair]).tolist() - - with vllm_runner( - model_name, runner="pooling", dtype=DTYPE, max_model_len=None - ) as vllm_model: - vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) - - assert len(vllm_outputs) == 1 - assert len(hf_outputs) == 1 - - assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) - - -def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): - text_pairs = [ - [TEXTS_1[0], TEXTS_2[0]], - [TEXTS_1[0], TEXTS_2[1]], - ] - - with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: - hf_outputs = hf_model.predict(text_pairs).tolist() - - with vllm_runner( - model_name, runner="pooling", dtype=DTYPE, max_model_len=None - ) as vllm_model: - vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) - - assert len(vllm_outputs) == 2 - assert len(hf_outputs) == 2 - - assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) - assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) - - -def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): - text_pairs = [ - [TEXTS_1[0], TEXTS_2[0]], - [TEXTS_1[1], TEXTS_2[1]], - ] - - with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: - hf_outputs = hf_model.predict(text_pairs).tolist() - - with vllm_runner( - model_name, runner="pooling", dtype=DTYPE, max_model_len=None - ) as vllm_model: - vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) - - assert len(vllm_outputs) == 2 - assert len(hf_outputs) == 2 - - assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) - assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) - - -@pytest.fixture(scope="module", params=EMBEDDING_MODELS) -def emb_model_name(request): - yield request.param - - -def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name): - text_pair = [TEXTS_1[0], TEXTS_2[0]] - - with hf_runner( - emb_model_name, dtype=DTYPE, is_sentence_transformer=True - ) as hf_model: - hf_embeddings = hf_model.encode(text_pair) - hf_outputs = [F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)] - - with vllm_runner( - emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None - ) as vllm_model: - vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) - - assert len(vllm_outputs) == 1 - assert len(hf_outputs) == 1 - - assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) - - -def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): - text_pairs = [ - [TEXTS_1[0], TEXTS_2[0]], - [TEXTS_1[0], TEXTS_2[1]], - ] - - with hf_runner( - emb_model_name, dtype=DTYPE, is_sentence_transformer=True - ) as hf_model: - hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs] - hf_outputs = [ - F.cosine_similarity(*map(torch.tensor, pair), dim=0) - for pair in hf_embeddings - ] - - with vllm_runner( - emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None - ) as vllm_model: - vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) - - assert len(vllm_outputs) == 2 - assert len(hf_outputs) == 2 - - assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) - assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) - - -def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): - text_pairs = [ - [TEXTS_1[0], TEXTS_2[0]], - [TEXTS_1[1], TEXTS_2[1]], - ] - - with hf_runner( - emb_model_name, dtype=DTYPE, is_sentence_transformer=True - ) as hf_model: - hf_embeddings = [hf_model.encode(text_pair) for text_pair in text_pairs] - hf_outputs = [ - F.cosine_similarity(*map(torch.tensor, pair), dim=0) - for pair in hf_embeddings - ] - - with vllm_runner( - emb_model_name, runner="pooling", dtype=DTYPE, max_model_len=None - ) as vllm_model: - vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) - - assert len(vllm_outputs) == 2 - assert len(hf_outputs) == 2 - - assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) - assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 6ba515836..cab5a536c 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -461,7 +461,7 @@ class OpenAIServing: return self.create_error_response( "truncate_prompt_tokens value is " "greater than max_model_len." - " Please, select a smaller truncation size." + " Please request a smaller truncation size." ) return None @@ -724,7 +724,7 @@ class OpenAIServing: f"This model's maximum context length is " f"{max_model_len} tokens. However, you requested " f"{token_num} tokens in the input for {operation}. " - f"Please reduce the length of the input.", + f"Please reduce the length of the input prompt.", parameter="input_tokens", value=token_num, ) diff --git a/vllm/entrypoints/pooling/base/serving.py b/vllm/entrypoints/pooling/base/serving.py index 1f7238e27..60685e90e 100644 --- a/vllm/entrypoints/pooling/base/serving.py +++ b/vllm/entrypoints/pooling/base/serving.py @@ -224,7 +224,7 @@ class PoolingServing: raise ValueError( "truncate_prompt_tokens value is " "greater than max_model_len." - " Please, select a smaller truncation size." + " Please request a smaller truncation size." ) return None