[Refactor] Relocate entrypoint tests to match serving code structure (#37593)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
This commit is contained in:
193
tests/entrypoints/serve/render/test_launch_render.py
Normal file
193
tests/entrypoints/serve/render/test_launch_render.py
Normal file
@@ -0,0 +1,193 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""E2E tests for render endpoints via `vllm launch` (GPU-less serving)."""
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteLaunchRenderServer
|
||||
|
||||
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args: list[str] = []
|
||||
with RemoteLaunchRenderServer(MODEL_NAME, args, max_wait_seconds=120) as srv:
|
||||
yield srv
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with httpx.AsyncClient(
|
||||
base_url=server.url_for(""), timeout=30.0
|
||||
) as http_client:
|
||||
yield http_client
|
||||
|
||||
|
||||
# -- Chat Completion Render --
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_render_basic(client):
|
||||
response = await client.post(
|
||||
"/v1/chat/completions/render",
|
||||
json={
|
||||
"model": MODEL_NAME,
|
||||
"messages": [{"role": "user", "content": "Hello, how are you?"}],
|
||||
},
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Response should be a GenerateRequest dict
|
||||
assert isinstance(data, dict)
|
||||
assert "token_ids" in data
|
||||
assert isinstance(data["token_ids"], list)
|
||||
assert len(data["token_ids"]) > 0
|
||||
assert all(isinstance(t, int) for t in data["token_ids"])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_render_multi_turn(client):
|
||||
response = await client.post(
|
||||
"/v1/chat/completions/render",
|
||||
json={
|
||||
"model": MODEL_NAME,
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hello"},
|
||||
{"role": "assistant", "content": "Hi there!"},
|
||||
{"role": "user", "content": "How are you?"},
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
assert isinstance(data, dict)
|
||||
assert "token_ids" in data
|
||||
assert isinstance(data["token_ids"], list)
|
||||
assert len(data["token_ids"]) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_render_invalid_model(client):
|
||||
response = await client.post(
|
||||
"/v1/chat/completions/render",
|
||||
json={
|
||||
"model": "nonexistent-model",
|
||||
"messages": [{"role": "user", "content": "Hello"}],
|
||||
},
|
||||
)
|
||||
|
||||
assert response.status_code == 404
|
||||
assert "error" in response.json()
|
||||
|
||||
|
||||
# -- Completion Render --
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_completion_render_basic(client):
|
||||
response = await client.post(
|
||||
"/v1/completions/render",
|
||||
json={
|
||||
"model": MODEL_NAME,
|
||||
"prompt": "Once upon a time",
|
||||
},
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
assert isinstance(data, list)
|
||||
assert len(data) > 0
|
||||
|
||||
first_prompt = data[0]
|
||||
assert "token_ids" in first_prompt
|
||||
assert "sampling_params" in first_prompt
|
||||
assert "model" in first_prompt
|
||||
assert "request_id" in first_prompt
|
||||
assert isinstance(first_prompt["token_ids"], list)
|
||||
assert len(first_prompt["token_ids"]) > 0
|
||||
assert first_prompt["request_id"].startswith("cmpl-")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_completion_render_multiple_prompts(client):
|
||||
response = await client.post(
|
||||
"/v1/completions/render",
|
||||
json={
|
||||
"model": MODEL_NAME,
|
||||
"prompt": ["Hello world", "Goodbye world"],
|
||||
},
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
assert isinstance(data, list)
|
||||
assert len(data) == 2
|
||||
|
||||
for prompt in data:
|
||||
assert "token_ids" in prompt
|
||||
assert "sampling_params" in prompt
|
||||
assert "model" in prompt
|
||||
assert "request_id" in prompt
|
||||
assert len(prompt["token_ids"]) > 0
|
||||
assert prompt["request_id"].startswith("cmpl-")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_completion_render_invalid_model(client):
|
||||
response = await client.post(
|
||||
"/v1/completions/render",
|
||||
json={
|
||||
"model": "nonexistent-model",
|
||||
"prompt": "Hello",
|
||||
},
|
||||
)
|
||||
|
||||
assert response.status_code == 404
|
||||
assert "error" in response.json()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_render_is_fast(client):
|
||||
"""Render should complete quickly since there is no inference."""
|
||||
import time
|
||||
|
||||
start = time.perf_counter()
|
||||
response = await client.post(
|
||||
"/v1/completions/render",
|
||||
json={
|
||||
"model": MODEL_NAME,
|
||||
"prompt": "Tell me a very long story about " * 10,
|
||||
},
|
||||
)
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
assert response.status_code == 200
|
||||
assert elapsed < 2.0
|
||||
|
||||
|
||||
# -- Health & Models --
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_health_endpoint(client):
|
||||
response = await client.get("/health")
|
||||
assert response.status_code == 200
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_models_endpoint(client):
|
||||
response = await client.get("/v1/models")
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert "data" in data
|
||||
model_ids = [m["id"] for m in data["data"]]
|
||||
assert MODEL_NAME in model_ids
|
||||
Reference in New Issue
Block a user