# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from openai import OpenAI

from tests.utils import RemoteOpenAIServer

from .conftest import validate_streaming_event_stack

MODEL_NAME = "Qwen/Qwen3-8B"


@pytest.fixture(scope="module")
def server():
    from .conftest import BASE_TEST_ENV

    args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
    env_dict = {
        **BASE_TEST_ENV,
        "VLLM_ENABLE_RESPONSES_API_STORE": "1",
        # uncomment for tool calling
        # "PYTHON_EXECUTION_BACKEND": "dangerously_use_uv",
    }
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
        input="What is 123 * 456?",
    )
    assert response is not None
    print("response: ", response)
    assert response.status == "completed"
    assert response.incomplete_details is None


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_enable_response_messages(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
        input="Hello?",
        extra_body={"enable_response_messages": True},
    )
    assert response.status == "completed"
    assert response.input_messages[0]["type"] == "raw_message_tokens"
    assert type(response.input_messages[0]["message"]) is str
    assert len(response.input_messages[0]["message"]) > 10
    assert type(response.input_messages[0]["tokens"][0]) is int
    assert type(response.output_messages[0]["message"]) is str
    assert len(response.output_messages[0]["message"]) > 10
    assert type(response.output_messages[0]["tokens"][0]) is int


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    # make sure we get a reasoning and text output
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "message"
    assert type(response.output[1].content[0].text) is str


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_output_consistency(client: OpenAI, model_name: str):
    """Test that streaming delta text matches the final response output_text.

    This test verifies that in streaming mode the text concatenated from all
    'response.output_text.delta' events
    matches the 'output_text' in the final 'response.completed' event.
    """
    response = await client.responses.create(
        model=model_name,
        input="Say hello in one sentence.",
        stream=True,
    )
    events = []
    async for event in response:
        events.append(event)
    assert len(events) > 0

    # Concatenate all delta text from streaming events
    streaming_text = "".join(
        event.delta for event in events if event.type == "response.output_text.delta"
    )

    # Get the final response from the last event
    response_completed_event = events[-1]
    assert response_completed_event.type == "response.completed"
    assert response_completed_event.response.status == "completed"

    # Get output_text from the final response
    final_output_text = response_completed_event.response.output_text

    # Verify final response has output
    assert len(response_completed_event.response.output) > 0

    # Verify streaming text matches final output_text
    assert streaming_text == final_output_text, (
        f"Streaming text does not match final output_text.\n"
        f"Streaming: {streaming_text!r}\n"
        f"Final: {final_output_text!r}"
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_logprobs(client: OpenAI, model_name: str):
    """Test that streaming with logprobs returns valid logprob data on
    output_text.delta events and that top_logprobs has the requested count."""
    response = await client.responses.create(
        model=model_name,
        input="Say hello.",
        stream=True,
        top_logprobs=3,
        include=["message.output_text.logprobs"],
    )
    events = []
    async for event in response:
        events.append(event)
    assert len(events) > 0

    # Collect all output_text.delta events that carry logprobs
    text_delta_events = [e for e in events if e.type == "response.output_text.delta"]
    assert len(text_delta_events) > 0, "Expected at least one text delta event"

    for delta_event in text_delta_events:
        logprobs = delta_event.logprobs
        assert logprobs is not None, "logprobs should be present on text delta events"
        assert len(logprobs) > 0, "logprobs list should not be empty"
        for lp in logprobs:
            # Each logprob entry must have a token and a logprob value
            assert lp.token is not None
            assert isinstance(lp.logprob, float)
            assert lp.logprob <= 0.0, f"logprob should be <= 0, got {lp.logprob}"
            # top_logprobs should have up to 3 entries
            assert lp.top_logprobs is not None
            assert len(lp.top_logprobs) <= 3
            for tl in lp.top_logprobs:
                assert tl.token is not None
                assert isinstance(tl.logprob, float)

    # Verify that top_logprobs are actually populated, not always empty
    all_top_logprobs = [
        tl for e in text_delta_events for lp in e.logprobs for tl in lp.top_logprobs
    ]
    assert len(all_top_logprobs) > 0, (
        "Expected at least one top_logprobs entry across all delta events"
    )

    # Verify the completed event still has valid output
    completed = events[-1]
    assert completed.type == "response.completed"
    assert completed.response.status == "completed"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_reasoning_tokens_e2e(client: OpenAI, model_name: str):
    """Verify final usage includes reasoning_tokens in streaming mode."""
    response = await client.responses.create(
        model=model_name,
        input="Compute 17 * 19 and explain briefly.",
        reasoning={"effort": "low"},
        temperature=0.0,
        stream=True,
    )
    completed_event = None
    async for event in response:
        if event.type == "response.completed":
            completed_event = event

    assert completed_event is not None
    assert completed_event.response.status == "completed"
    assert completed_event.response.usage is not None
    assert completed_event.response.usage.output_tokens_details is not None
    assert completed_event.response.usage.output_tokens_details.reasoning_tokens > 0, (
        "Expected reasoning_tokens > 0 for streamed Qwen3 response."
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_non_streaming_reasoning_tokens_e2e(client: OpenAI, model_name: str):
    """Verify usage includes reasoning_tokens in non-streaming mode."""
    response = await client.responses.create(
        model=model_name,
        input="Compute 23 * 17 and explain briefly.",
        reasoning={"effort": "low"},
        temperature=0.0,
        stream=False,
    )
    assert response is not None
    assert response.status == "completed"
    assert response.usage is not None
    assert response.usage.output_tokens_details is not None
    assert response.usage.output_tokens_details.reasoning_tokens > 0, (
        "Expected reasoning_tokens > 0 for non-streamed Qwen3 response."
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert response is not None
    assert response.status == "incomplete"
    assert response.incomplete_details.reason == "max_output_tokens"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_extra_sampling_params(client: OpenAI, model_name: str):
    """Test that extra sampling parameters are accepted and work."""
    # Test with multiple sampling parameters - just verify they're accepted
    response = await client.responses.create(
        model=model_name,
        input="Write a short sentence",
        max_output_tokens=50,
        temperature=0.7,
        top_p=0.9,
        extra_body={
            "top_k": 40,
            "repetition_penalty": 1.2,
            "seed": 42,
        },
    )
    # Verify request succeeded and parameters were accepted
    assert response.status in ["completed", "incomplete"]
    assert len(response.output) > 0
    assert response.output[0].content[0].text  # Has text output


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_types(
    pairs_of_event_types: dict[str, str], client: OpenAI, model_name: str
):
    stream = await client.responses.create(
        model=model_name,
        input="tell me a story about a cat in 20 words",
        reasoning={"effort": "low"},
        tools=[],
        stream=True,
        background=False,
    )
    events = []
    async for event in stream:
        events.append(event)
    validate_streaming_event_stack(events, pairs_of_event_types)