[responsesAPI] add better error messaging for long prompts (#25724)

Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
This commit is contained in:
Andrew Xia
2025-10-03 14:33:13 -07:00
committed by GitHub
parent c1ffcb55da
commit 831b124151
2 changed files with 84 additions and 1 deletions

View File

@@ -8,9 +8,10 @@ import pytest
import pytest_asyncio
from vllm.entrypoints.context import ConversationContext
from vllm.entrypoints.openai.protocol import ResponsesRequest
from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest
from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
class MockConversationContext(ConversationContext):
@@ -127,3 +128,63 @@ class TestInitializeToolSessions:
# Verify that init_tool_sessions was called
assert mock_context.init_tool_sessions_called
class TestValidateGeneratorInput:
    """Tests for OpenAIServingResponses._validate_generator_input."""

    @pytest_asyncio.fixture
    async def serving_responses_instance(self):
        """Build an OpenAIServingResponses backed by mocked dependencies.

        The instance's ``max_model_len`` is forced to 100 so the tests can
        construct prompts on either side of the limit.
        """
        # Minimal mocks for the constructor's required dependencies.
        engine_client = MagicMock()
        engine_client.get_model_config = AsyncMock()

        model_config = MagicMock()
        model_config.hf_config.model_type = "test"
        model_config.get_diff_sampling_param.return_value = {}

        models = MagicMock()

        # Create the actual (non-mocked) serving instance.
        instance = OpenAIServingResponses(
            engine_client=engine_client,
            model_config=model_config,
            models=models,
            request_logger=None,
            chat_template=None,
            chat_template_content_format="auto",
        )

        # Pin the context limit the validation is checked against.
        instance.max_model_len = 100

        return instance

    def test_validate_generator_input(self, serving_responses_instance):
        """Check both the accepted and the rejected prompt-length paths.

        A prompt shorter than ``max_model_len`` must yield ``None``; a prompt
        longer than ``max_model_len`` must yield an ``ErrorResponse``.
        """
        # Valid case: 5 tokens < 100 max_model_len.
        valid_prompt_token_ids = list(range(5))
        valid_prompt = EngineTokensPrompt(
            prompt_token_ids=valid_prompt_token_ids)

        valid_result = serving_responses_instance._validate_generator_input(
            valid_prompt)

        # No error is reported for a prompt that fits.
        assert valid_result is None

        # Invalid case: 200 tokens > 100 max_model_len.
        invalid_prompt_token_ids = list(range(200))
        invalid_prompt = EngineTokensPrompt(
            prompt_token_ids=invalid_prompt_token_ids)

        invalid_result = serving_responses_instance._validate_generator_input(
            invalid_prompt)

        # An over-length prompt is reported as an ErrorResponse rather than
        # being passed through.
        assert invalid_result is not None
        assert isinstance(invalid_result, ErrorResponse)