114 lines
3.1 KiB
Python
114 lines
3.1 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
import json
|
|
|
|
import httpx
|
|
import pytest
|
|
|
|
from tests.utils import RemoteOpenAIServer
|
|
|
|
# any model with a chat template defined in tokenizer_config should work here
|
|
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
|
|
|
|
|
@pytest.fixture(scope="module")
|
|
def default_server_args():
|
|
return [
|
|
# use half precision for speed and memory savings in CI environment
|
|
"--max-model-len",
|
|
"2048",
|
|
"--max-num-seqs",
|
|
"128",
|
|
"--enforce-eager",
|
|
]
|
|
|
|
|
|
@pytest.fixture(scope="module")
|
|
def server(default_server_args):
|
|
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
|
|
yield remote_server
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_batched_chat_completions(
    server: RemoteOpenAIServer, model_name: str
) -> None:
    """Send two conversations to the batch chat endpoint in one request.

    Verifies the response carries one choice per conversation, that the
    choice indices map back to the inputs, and that every choice has
    non-empty text content.
    """
    conversations = [
        [{"role": "user", "content": "Reply with exactly the word: alpha"}],
        [{"role": "user", "content": "Reply with exactly the word: beta"}],
    ]

    async with httpx.AsyncClient() as http_client:
        response = await http_client.post(
            # url_for() already returns a complete URL string; the former
            # f-string wrapper around it was redundant.
            server.url_for("v1/chat/completions/batch"),
            json={
                "model": model_name,
                "messages": conversations,
            },
            timeout=60,
        )

    assert response.status_code == 200, response.text
    data = response.json()

    # One choice per input conversation.
    choices = data["choices"]
    assert len(choices) == 2

    # Indices identify which conversation each choice answers; both must
    # be present exactly once.
    indices = {choice["index"] for choice in choices}
    assert indices == {0, 1}

    # Each conversation should produce a non-empty text response.
    for choice in choices:
        assert choice["message"]["content"]
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_batched_chat_completions_with_json_schema(
    server: RemoteOpenAIServer, model_name: str
) -> None:
    """Batch two conversations with a strict JSON-schema response format.

    Verifies that every returned choice parses as JSON and conforms to
    the schema: an object with a required "answer" key restricted to
    "yes" or "no".
    """
    schema = {
        "type": "object",
        "properties": {
            "answer": {"type": "string", "enum": ["yes", "no"]},
        },
        "required": ["answer"],
    }
    conversations = [
        [{"role": "user", "content": "Is the sky blue? Answer in JSON."}],
        [{"role": "user", "content": "Is fire cold? Answer in JSON."}],
    ]

    async with httpx.AsyncClient() as http_client:
        response = await http_client.post(
            # url_for() already returns a complete URL string; the former
            # f-string wrapper around it was redundant.
            server.url_for("v1/chat/completions/batch"),
            json={
                "model": model_name,
                "messages": conversations,
                "response_format": {
                    "type": "json_schema",
                    "json_schema": {"name": "answer", "schema": schema, "strict": True},
                },
            },
            timeout=60,
        )

    assert response.status_code == 200, response.text
    data = response.json()

    # One choice per input conversation.
    choices = data["choices"]
    assert len(choices) == 2

    # Every choice must decode as JSON matching the constrained schema.
    for choice in choices:
        parsed = json.loads(choice["message"]["content"])
        assert "answer" in parsed
        assert parsed["answer"] in ("yes", "no")
|