vllm/tests/entrypoints/openai/test_serving_responses.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import AsyncExitStack
from unittest.mock import MagicMock
import pytest
import pytest_asyncio
from openai.types.responses.tool import (
CodeInterpreterContainerCodeInterpreterToolAuto,
LocalShell,
Mcp,
Tool,
)
import vllm.envs as envs
from vllm.entrypoints.mcp.tool_server import ToolServer
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.responses.context import ConversationContext, SimpleContext
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.entrypoints.openai.responses.serving import (
OpenAIServingResponses,
_extract_allowed_tools_from_mcp_requests,
extract_tool_types,
)
from vllm.entrypoints.openai.responses.streaming_events import (
StreamingState,
)
from vllm.inputs.data import TokensPrompt
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams
class MockConversationContext(ConversationContext):
"""Mock conversation context for testing"""
def __init__(self):
self.init_tool_sessions_called = False
self.init_tool_sessions_args = None
self.init_tool_sessions_kwargs = None
def append_output(self, output) -> None:
pass
def append_tool_output(self, output) -> None:
pass
async def call_tool(self):
return []
def need_builtin_tool_call(self) -> bool:
return False
def render_for_completion(self):
return []
async def init_tool_sessions(self, tool_server, exit_stack, request_id, mcp_tools):
self.init_tool_sessions_called = True
self.init_tool_sessions_args = (tool_server, exit_stack, request_id, mcp_tools)
async def cleanup_session(self) -> None:
pass
@pytest.fixture
def mock_serving_responses():
"""Create a mock OpenAIServingResponses instance"""
serving_responses = MagicMock(spec=OpenAIServingResponses)
serving_responses.tool_server = MagicMock(spec=ToolServer)
return serving_responses
@pytest.fixture
def mock_context():
"""Create a mock conversation context"""
return MockConversationContext()
@pytest.fixture
def mock_exit_stack():
"""Create a mock async exit stack"""
return MagicMock(spec=AsyncExitStack)
def test_extract_tool_types(monkeypatch: pytest.MonkeyPatch) -> None:
tools: list[Tool] = []
assert extract_tool_types(tools) == set()
tools.append(LocalShell(type="local_shell"))
assert extract_tool_types(tools) == {"local_shell"}
tools.append(CodeInterpreterContainerCodeInterpreterToolAuto(type="auto"))
assert extract_tool_types(tools) == {"local_shell", "auto"}
tools.extend(
[
Mcp(type="mcp", server_label="random", server_url=""),
Mcp(type="mcp", server_label="container", server_url=""),
Mcp(type="mcp", server_label="code_interpreter", server_url=""),
Mcp(type="mcp", server_label="web_search_preview", server_url=""),
]
)
# When envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS is not set,
# mcp tool types are all ignored.
assert extract_tool_types(tools) == {"local_shell", "auto"}
    # When "container" is allow-listed, it is extracted.
monkeypatch.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "container")
assert extract_tool_types(tools) == {"local_shell", "auto", "container"}
    # When code_interpreter and web_search_preview are allow-listed,
    # they are extracted.
monkeypatch.setenv(
"VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,web_search_preview"
)
assert extract_tool_types(tools) == {
"local_shell",
"auto",
"code_interpreter",
"web_search_preview",
}
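# A rough sketch of the filtering exercised above (an assumption, not the real
# implementation): non-MCP tools always contribute their `type`, while an MCP
# tool contributes its `server_label` only if that label appears in the
# comma-separated VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS allow-list, e.g.
#
#   allowed = set(labels_env.split(",")) if labels_env else set()
#   extracted = {t.server_label if t.type == "mcp" else t.type
#                for t in tools
#                if t.type != "mcp" or t.server_label in allowed}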
class TestInitializeToolSessions:
"""Test class for _initialize_tool_sessions method"""
@pytest_asyncio.fixture
async def serving_responses_instance(self):
"""Create a real OpenAIServingResponses instance for testing"""
# Create minimal mocks for required dependencies
engine_client = MagicMock()
model_config = MagicMock()
model_config.max_model_len = 100
model_config.hf_config.model_type = "test"
model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config
engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock()
engine_client.renderer = MagicMock()
models = MagicMock()
tool_server = MagicMock(spec=ToolServer)
# Create the actual instance
instance = OpenAIServingResponses(
engine_client=engine_client,
models=models,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
tool_server=tool_server,
)
return instance
@pytest.mark.asyncio
async def test_initialize_tool_sessions(
self, serving_responses_instance, mock_context, mock_exit_stack
):
"""Test that method works correctly with only MCP tools"""
request = ResponsesRequest(input="test input", tools=[])
# Call the method
await serving_responses_instance._initialize_tool_sessions(
request, mock_context, mock_exit_stack
)
assert mock_context.init_tool_sessions_called is False
# Create only MCP tools
tools = [
{"type": "web_search_preview"},
{"type": "code_interpreter", "container": {"type": "auto"}},
]
request = ResponsesRequest(input="test input", tools=tools)
# Call the method
await serving_responses_instance._initialize_tool_sessions(
request, mock_context, mock_exit_stack
)
# Verify that init_tool_sessions was called
assert mock_context.init_tool_sessions_called
def test_validate_create_responses_input(
self, serving_responses_instance, mock_context, mock_exit_stack
):
request = ResponsesRequest(
input="test input",
previous_input_messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is my horoscope? I am an Aquarius.",
}
],
}
],
previous_response_id="lol",
)
error = serving_responses_instance._validate_create_responses_input(request)
assert error is not None
assert error.error.type == "invalid_request_error"
class TestValidateGeneratorInput:
"""Test class for _validate_generator_input method"""
@pytest_asyncio.fixture
async def serving_responses_instance(self):
"""Create a real OpenAIServingResponses instance for testing"""
# Create minimal mocks for required dependencies
engine_client = MagicMock()
model_config = MagicMock()
model_config.max_model_len = 100
model_config.hf_config.model_type = "test"
model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config
engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock()
engine_client.renderer = MagicMock()
models = MagicMock()
# Create the actual instance
instance = OpenAIServingResponses(
engine_client=engine_client,
models=models,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
return instance
def test_validate_generator_input(self, serving_responses_instance):
"""Test _validate_generator_input with valid prompt length"""
# Create an engine prompt with valid length (less than max_model_len)
valid_prompt_token_ids = list(range(5)) # 5 tokens < 100 max_model_len
engine_prompt = TokensPrompt(prompt_token_ids=valid_prompt_token_ids)
# Call the method
result = serving_responses_instance._validate_generator_input(engine_prompt)
# Should return None for valid input
assert result is None
# create an invalid engine prompt
        invalid_prompt_token_ids = list(range(200))  # 200 tokens >= 100 max_model_len
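        # A prompt of exactly max_model_len tokens would presumably also be
        # rejected, since at least one token must be left for generation; only
        # the clearly over-limit case is exercised here.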
engine_prompt = TokensPrompt(prompt_token_ids=invalid_prompt_token_ids)
# Call the method
result = serving_responses_instance._validate_generator_input(engine_prompt)
# Should return an ErrorResponse
assert result is not None
assert isinstance(result, ErrorResponse)
@pytest.mark.asyncio
async def test_reasoning_tokens_counted_for_text_reasoning_model(monkeypatch):
"""Ensure reasoning_tokens usage is derived from thinking token spans."""
class FakeTokenizer:
def __init__(self):
self._vocab = {"<think>": 1, "</think>": 2, "reason": 3, "final": 4}
def get_vocab(self):
return self._vocab
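    # get_vocab() is presumably all the reasoning-token accounting needs here:
    # it only has to resolve the <think> / </think> token ids.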
# Force non-harmony, SimpleContext path
monkeypatch.setattr(envs, "VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT", False)
engine_client = MagicMock()
model_config = MagicMock()
model_config.hf_config.model_type = "test"
model_config.hf_text_config = MagicMock()
model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config
engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock()
engine_client.renderer = MagicMock()
tokenizer = FakeTokenizer()
engine_client.renderer.get_tokenizer.return_value = tokenizer
models = MagicMock()
serving = OpenAIServingResponses(
engine_client=engine_client,
models=models,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
reasoning_parser="qwen3",
)
# Build a SimpleContext with thinking tokens in the output.
context = SimpleContext()
token_ids = [1, 10, 2, 20] # <think> 10 </think> 20 -> reasoning token count = 1
completion = CompletionOutput(
index=0,
text="<think>reason</think>final",
token_ids=token_ids,
cumulative_logprob=0.0,
logprobs=None,
finish_reason="stop",
stop_reason=None,
)
req_output = RequestOutput(
request_id="req",
prompt="hi",
prompt_token_ids=[7, 8],
prompt_logprobs=None,
outputs=[completion],
finished=True,
num_cached_tokens=0,
)
context.append_output(req_output)
async def dummy_result_generator():
yield None
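    # Stand-in for the engine output stream; the final RequestOutput was
    # already appended to the context above, so no real results are needed.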
request = ResponsesRequest(input="hi", tools=[], stream=False)
sampling_params = SamplingParams(max_tokens=16)
metadata = RequestResponseMetadata(request_id="req")
response = await serving.responses_full_generator(
request=request,
sampling_params=sampling_params,
result_generator=dummy_result_generator(),
context=context,
model_name="test-model",
tokenizer=tokenizer,
request_metadata=metadata,
)
assert response.usage.output_tokens_details.reasoning_tokens == 1
class TestExtractAllowedToolsFromMcpRequests:
"""Test class for _extract_allowed_tools_from_mcp_requests function"""
def test_extract_allowed_tools_basic_formats(self):
"""Test extraction with list format, object format, and None."""
from openai.types.responses.tool import McpAllowedToolsMcpToolFilter
tools = [
# List format
Mcp(
type="mcp",
server_label="server1",
allowed_tools=["tool1", "tool2"],
),
# Object format
Mcp(
type="mcp",
server_label="server2",
allowed_tools=McpAllowedToolsMcpToolFilter(
tool_names=["tool3", "tool4"]
),
),
# None (no filter)
Mcp(
type="mcp",
server_label="server3",
allowed_tools=None,
),
]
result = _extract_allowed_tools_from_mcp_requests(tools)
assert result == {
"server1": ["tool1", "tool2"],
"server2": ["tool3", "tool4"],
"server3": None,
}
def test_extract_allowed_tools_star_normalization(self):
"""Test that '*' wildcard is normalized to None (select all tools).
This is the key test requested by reviewers to explicitly demonstrate
that the "*" select-all scenario is handled correctly.
"""
from openai.types.responses.tool import McpAllowedToolsMcpToolFilter
tools = [
# Star in list format
Mcp(
type="mcp",
server_label="server1",
allowed_tools=["*"],
),
# Star mixed with other tools in list
Mcp(
type="mcp",
server_label="server2",
allowed_tools=["tool1", "*"],
),
# Star in object format
Mcp(
type="mcp",
server_label="server3",
allowed_tools=McpAllowedToolsMcpToolFilter(tool_names=["*"]),
),
]
result = _extract_allowed_tools_from_mcp_requests(tools)
# All should be normalized to None (allows all tools)
assert result == {
"server1": None,
"server2": None,
"server3": None,
}
def test_extract_allowed_tools_filters_non_mcp(self):
"""Test that non-MCP tools are ignored during extraction."""
tools = [
Mcp(
type="mcp",
server_label="server1",
allowed_tools=["tool1"],
),
LocalShell(type="local_shell"), # Non-MCP tool should be ignored
Mcp(
type="mcp",
server_label="server2",
allowed_tools=["tool2"],
),
]
result = _extract_allowed_tools_from_mcp_requests(tools)
# Non-MCP tools should be ignored
assert result == {
"server1": ["tool1"],
"server2": ["tool2"],
}
class TestHarmonyPreambleStreaming:
"""Tests for preamble (commentary with no recipient) streaming events."""
@staticmethod
def _make_ctx(*, channel, recipient, delta="hello"):
"""Build a lightweight mock StreamingHarmonyContext."""
ctx = MagicMock()
ctx.last_content_delta = delta
ctx.parser.current_channel = channel
ctx.parser.current_recipient = recipient
return ctx
@staticmethod
def _make_previous_item(*, channel, recipient, text="preamble text"):
"""Build a lightweight mock previous_item (openai_harmony Message)."""
content_part = MagicMock()
content_part.text = text
item = MagicMock()
item.channel = channel
item.recipient = recipient
item.content = [content_part]
return item
def test_preamble_delta_emits_text_events(self) -> None:
"""commentary + recipient=None should emit output_text.delta events."""
from vllm.entrypoints.openai.responses.streaming_events import (
emit_content_delta_events,
)
ctx = self._make_ctx(channel="commentary", recipient=None)
state = StreamingState()
events = emit_content_delta_events(ctx, state)
type_names = [e.type for e in events]
assert "response.output_text.delta" in type_names
assert "response.output_item.added" in type_names
def test_preamble_delta_second_token_no_added(self) -> None:
"""Second preamble token should emit delta only, not added again."""
from vllm.entrypoints.openai.responses.streaming_events import (
emit_content_delta_events,
)
ctx = self._make_ctx(channel="commentary", recipient=None, delta="w")
state = StreamingState()
state.sent_output_item_added = True
state.current_item_id = "msg_test"
state.current_content_index = 0
events = emit_content_delta_events(ctx, state)
type_names = [e.type for e in events]
assert "response.output_text.delta" in type_names
assert "response.output_item.added" not in type_names
def test_commentary_with_function_recipient_not_preamble(self) -> None:
"""commentary + recipient='functions.X' must NOT use preamble path."""
from vllm.entrypoints.openai.responses.streaming_events import (
emit_content_delta_events,
)
ctx = self._make_ctx(
channel="commentary",
recipient="functions.get_weather",
)
state = StreamingState()
events = emit_content_delta_events(ctx, state)
type_names = [e.type for e in events]
assert "response.output_text.delta" not in type_names
def test_preamble_done_emits_text_done_events(self) -> None:
"""Completed preamble should emit text done + content_part done +
output_item done, same shape as final channel."""
from vllm.entrypoints.openai.responses.streaming_events import (
emit_previous_item_done_events,
)
previous = self._make_previous_item(channel="commentary", recipient=None)
state = StreamingState()
state.current_item_id = "msg_test"
state.current_output_index = 0
state.current_content_index = 0
events = emit_previous_item_done_events(previous, state)
type_names = [e.type for e in events]
assert "response.output_text.done" in type_names
assert "response.content_part.done" in type_names
assert "response.output_item.done" in type_names
def test_commentary_with_recipient_no_preamble_done(self) -> None:
"""commentary + recipient='functions.X' should route to function call
done, not preamble done."""
from vllm.entrypoints.openai.responses.streaming_events import (
emit_previous_item_done_events,
)
previous = self._make_previous_item(
channel="commentary", recipient="functions.get_weather"
)
state = StreamingState()
state.current_item_id = "fc_test"
events = emit_previous_item_done_events(previous, state)
type_names = [e.type for e in events]
assert "response.output_text.done" not in type_names