# Source file: vllm/tests/entrypoints/openai/chat_completion/test_batched_chat_completions.py
# (114 lines, 3.1 KiB, Python)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import httpx
import pytest
from tests.utils import RemoteOpenAIServer
# Any model whose tokenizer_config defines a chat template should work here.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def default_server_args():
    """CLI arguments shared by every server launched in this module."""
    # Keep the context window and batch size modest so CI runs stay fast
    # and memory-light; eager mode skips CUDA-graph capture overhead.
    args = [
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
    ]
    return args
@pytest.fixture(scope="module")
def server(default_server_args):
    """Yield one module-scoped vLLM OpenAI-compatible server for all tests."""
    launcher = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with launcher as running_server:
        yield running_server
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_batched_chat_completions(
    server: RemoteOpenAIServer, model_name: str
) -> None:
    """Posting two conversations in one batch yields one choice per input.

    The batch endpoint takes a list of conversations under ``messages`` and
    returns a single response whose ``choices`` carry an ``index`` mapping
    each choice back to its input conversation's position.
    """
    conversations = [
        [{"role": "user", "content": "Reply with exactly the word: alpha"}],
        [{"role": "user", "content": "Reply with exactly the word: beta"}],
    ]
    async with httpx.AsyncClient() as http_client:
        response = await http_client.post(
            # url_for already returns a complete URL string; the original
            # wrapped it in a redundant f-string.
            server.url_for("v1/chat/completions/batch"),
            json={
                "model": model_name,
                "messages": conversations,
            },
            timeout=60,
        )
    assert response.status_code == 200, response.text
    data = response.json()
    choices = data["choices"]
    assert len(choices) == 2
    # Indices must cover exactly the two input conversation positions.
    indices = {choice["index"] for choice in choices}
    assert indices == {0, 1}
    # Each conversation should produce a non-empty text response.
    for choice in choices:
        assert choice["message"]["content"]
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_batched_chat_completions_with_json_schema(
    server: RemoteOpenAIServer, model_name: str
) -> None:
    """A shared JSON-schema response_format constrains every batched reply.

    Both conversations are sent with one strict ``json_schema`` response
    format; each returned message must parse as JSON and contain an
    ``answer`` key restricted to the schema's enum values.
    """
    schema = {
        "type": "object",
        "properties": {
            "answer": {"type": "string", "enum": ["yes", "no"]},
        },
        "required": ["answer"],
    }
    conversations = [
        [{"role": "user", "content": "Is the sky blue? Answer in JSON."}],
        [{"role": "user", "content": "Is fire cold? Answer in JSON."}],
    ]
    async with httpx.AsyncClient() as http_client:
        response = await http_client.post(
            # url_for already returns a complete URL string; the original
            # wrapped it in a redundant f-string.
            server.url_for("v1/chat/completions/batch"),
            json={
                "model": model_name,
                "messages": conversations,
                "response_format": {
                    "type": "json_schema",
                    "json_schema": {"name": "answer", "schema": schema, "strict": True},
                },
            },
            timeout=60,
        )
    assert response.status_code == 200, response.text
    data = response.json()
    choices = data["choices"]
    assert len(choices) == 2
    # Every choice must be valid JSON that satisfies the enum constraint.
    for choice in choices:
        parsed = json.loads(choice["message"]["content"])
        assert "answer" in parsed
        assert parsed["answer"] in ("yes", "no")