# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
import pytest
|
|
import pytest_asyncio
|
|
from openai import OpenAI
|
|
|
|
from tests.utils import RemoteOpenAIServer
|
|
|
|
from .conftest import validate_streaming_event_stack
|
|
|
|
MODEL_NAME = "Qwen/Qwen3-8B"
|
|
|
|
|
|
@pytest.fixture(scope="module")
def server():
    """Module-scoped remote vLLM server using the qwen3 reasoning parser."""
    from .conftest import BASE_TEST_ENV

    server_args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
    env = dict(BASE_TEST_ENV)
    # Enable server-side response storage for the Responses API tests below.
    env["VLLM_ENABLE_RESPONSES_API_STORE"] = "1"
    # uncomment for tool calling
    # PYTHON_EXECUTION_BACKEND: "dangerously_use_uv",
    with RemoteOpenAIServer(MODEL_NAME, server_args, env_dict=env) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client wired to the module-scoped server fixture."""
    async with server.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """A plain non-streaming request completes and is not truncated."""
    resp = await client.responses.create(
        model=model_name,
        input="What is 123 * 456?",
    )
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
    assert resp.incomplete_details is None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_enable_response_messages(client: OpenAI, model_name: str):
    """With enable_response_messages, the server echoes raw token-level
    input/output messages alongside the normal response fields."""
    response = await client.responses.create(
        model=model_name,
        input="Hello?",
        extra_body={"enable_response_messages": True},
    )
    assert response.status == "completed"
    # Input side: raw chat-template text plus the prompt token ids.
    # isinstance() is the idiomatic type check (matches test_streaming_logprobs).
    assert response.input_messages[0]["type"] == "raw_message_tokens"
    assert isinstance(response.input_messages[0]["message"], str)
    assert len(response.input_messages[0]["message"]) > 10
    assert isinstance(response.input_messages[0]["tokens"][0], int)
    # Output side: generated text plus the sampled token ids.
    assert isinstance(response.output_messages[0]["message"], str)
    assert len(response.output_messages[0]["message"]) > 10
    assert isinstance(response.output_messages[0]["tokens"][0], int)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    """A prior reasoning item in the input is accepted, and the model still
    emits a fresh reasoning item followed by a message item."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    # make sure we get a reasoning and text output
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "message"
    # isinstance() is the idiomatic type check (matches test_streaming_logprobs).
    assert isinstance(response.output[1].content[0].text, str)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_output_consistency(client: OpenAI, model_name: str):
    """Test that streaming delta text matches the final response output_text.

    This test verifies that when using streaming mode:
    1. The concatenated text from all 'response.output_text.delta' events
    2. Matches the 'output_text' in the final 'response.completed' event
    """
    stream = await client.responses.create(
        model=model_name,
        input="Say hello in one sentence.",
        stream=True,
    )

    # Drain the stream into a list of events.
    events = [event async for event in stream]
    assert len(events) > 0

    # Concatenate all delta text from streaming events.
    delta_text = "".join(
        e.delta for e in events if e.type == "response.output_text.delta"
    )

    # The final event must be the completed response.
    final_event = events[-1]
    assert final_event.type == "response.completed"
    assert final_event.response.status == "completed"

    # Get output_text from the final response.
    final_text = final_event.response.output_text

    # Verify final response has output.
    assert len(final_event.response.output) > 0

    # Verify streaming text matches final output_text.
    assert delta_text == final_text, (
        f"Streaming text does not match final output_text.\n"
        f"Streaming: {delta_text!r}\n"
        f"Final: {final_text!r}"
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_logprobs(client: OpenAI, model_name: str):
    """Test that streaming with logprobs returns valid logprob data on
    output_text.delta events and that top_logprobs has the requested count."""
    stream = await client.responses.create(
        model=model_name,
        input="Say hello.",
        stream=True,
        top_logprobs=3,
        include=["message.output_text.logprobs"],
    )

    events = [event async for event in stream]
    assert len(events) > 0

    # Collect all output_text.delta events that carry logprobs.
    delta_events = [e for e in events if e.type == "response.output_text.delta"]
    assert len(delta_events) > 0, "Expected at least one text delta event"

    for ev in delta_events:
        entries = ev.logprobs
        assert entries is not None, "logprobs should be present on text delta events"
        assert len(entries) > 0, "logprobs list should not be empty"
        for entry in entries:
            # Each logprob entry must have a token and a logprob value.
            assert entry.token is not None
            assert isinstance(entry.logprob, float)
            assert entry.logprob <= 0.0, f"logprob should be <= 0, got {entry.logprob}"
            # top_logprobs should have up to 3 entries.
            assert entry.top_logprobs is not None
            assert len(entry.top_logprobs) <= 3
            for alt in entry.top_logprobs:
                assert alt.token is not None
                assert isinstance(alt.logprob, float)

    # Verify that top_logprobs are actually populated, not always empty.
    all_top_logprobs = [
        alt for ev in delta_events for entry in ev.logprobs for alt in entry.top_logprobs
    ]
    assert len(all_top_logprobs) > 0, (
        "Expected at least one top_logprobs entry across all delta events"
    )

    # Verify the completed event still has valid output.
    final_event = events[-1]
    assert final_event.type == "response.completed"
    assert final_event.response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_reasoning_tokens_e2e(client: OpenAI, model_name: str):
    """Verify final usage includes reasoning_tokens in streaming mode."""
    stream = await client.responses.create(
        model=model_name,
        input="Compute 17 * 19 and explain briefly.",
        reasoning={"effort": "low"},
        temperature=0.0,
        stream=True,
    )

    # Drain the stream, remembering the terminal completed event.
    completed = None
    async for ev in stream:
        if ev.type == "response.completed":
            completed = ev

    assert completed is not None
    assert completed.response.status == "completed"
    usage = completed.response.usage
    assert usage is not None
    assert usage.output_tokens_details is not None
    assert usage.output_tokens_details.reasoning_tokens > 0, (
        "Expected reasoning_tokens > 0 for streamed Qwen3 response."
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_non_streaming_reasoning_tokens_e2e(client: OpenAI, model_name: str):
    """Verify usage includes reasoning_tokens in non-streaming mode."""
    resp = await client.responses.create(
        model=model_name,
        input="Compute 23 * 17 and explain briefly.",
        reasoning={"effort": "low"},
        temperature=0.0,
        stream=False,
    )

    assert resp is not None
    assert resp.status == "completed"
    usage = resp.usage
    assert usage is not None
    assert usage.output_tokens_details is not None
    assert usage.output_tokens_details.reasoning_tokens > 0, (
        "Expected reasoning_tokens > 0 for non-streamed Qwen3 response."
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """A small max_output_tokens budget yields an incomplete response with
    the truncation reason reported."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert resp is not None
    assert resp.status == "incomplete"
    assert resp.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_extra_sampling_params(client: OpenAI, model_name: str):
    """Test that extra sampling parameters are accepted and work."""
    # Test with multiple sampling parameters - just verify they're accepted
    resp = await client.responses.create(
        model=model_name,
        input="Write a short sentence",
        max_output_tokens=50,
        temperature=0.7,
        top_p=0.9,
        extra_body={
            "top_k": 40,
            "repetition_penalty": 1.2,
            "seed": 42,
        },
    )

    # Verify request succeeded and parameters were accepted
    assert resp.status in ["completed", "incomplete"]
    assert len(resp.output) > 0
    assert resp.output[0].content[0].text  # Has text output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_types(
    pairs_of_event_types: dict[str, str], client: OpenAI, model_name: str
):
    """Streamed events form a well-nested stack of paired start/end types."""
    stream = await client.responses.create(
        model=model_name,
        input="tell me a story about a cat in 20 words",
        reasoning={"effort": "low"},
        tools=[],
        stream=True,
        background=False,
    )
    collected = [event async for event in stream]

    # Delegate structural validation to the shared conftest helper.
    validate_streaming_event_stack(collected, pairs_of_event_types)