Files
vllm/tests/entrypoints/openai/test_chat_error.py
Chenguang Zheng fcb73f306c [bugfix] add api process rank in default multimodal request (#36150)
Signed-off-by: fake0fan <645327136@qq.com>
Signed-off-by: Chenguang ZHENG <645327136@qq.com>
2026-03-06 12:00:09 +00:00

386 lines
11 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
BASE_MODEL_PATHS = [
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
]
@dataclass
class MockHFConfig:
model_type: str = "any"
@dataclass
class MockModelConfig:
task = "generate"
runner_type = "generate"
model = MODEL_NAME
tokenizer = MODEL_NAME
trust_remote_code = False
tokenizer_mode = "auto"
max_model_len = 100
tokenizer_revision = None
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()
hf_text_config = MockHFConfig()
logits_processors: list[str] | None = None
diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init = False
is_encoder_decoder: bool = False
is_multimodal_model: bool = False
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
@dataclass
class MockParallelConfig:
_api_process_rank: int = 0
@dataclass
class MockVllmConfig:
model_config: MockModelConfig
parallel_config: MockParallelConfig
def _build_renderer(model_config: MockModelConfig):
_, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
return HfRenderer.from_config(
MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
)
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels(
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
serving_chat = OpenAIServingChat(
engine,
models,
response_role="assistant",
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
async def _fake_preprocess_chat(*args, **kwargs):
# return conversation, engine_prompts
return (
[{"role": "user", "content": "Test"}],
[{"prompt_token_ids": [1, 2, 3]}],
)
serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
return serving_chat
@pytest.mark.asyncio
async def test_chat_error_non_stream():
"""test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)
completion_output = CompletionOutput(
index=0,
text="",
token_ids=[],
cumulative_logprob=None,
logprobs=None,
finish_reason="error",
)
request_output = RequestOutput(
request_id="test-id",
prompt="Test prompt",
prompt_token_ids=[1, 2, 3],
prompt_logprobs=None,
outputs=[completion_output],
finished=True,
metrics=None,
lora_request=None,
encoder_prompt=None,
encoder_prompt_token_ids=None,
)
async def mock_generate(*args, **kwargs):
yield request_output
mock_engine.generate = MagicMock(side_effect=mock_generate)
request = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{"role": "user", "content": "Test prompt"}],
max_tokens=10,
stream=False,
)
with pytest.raises(GenerationError):
await serving_chat.create_chat_completion(request)
@pytest.mark.asyncio
async def test_chat_error_stream():
"""test finish_reason='error' returns 500 InternalServerError (streaming)"""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)
completion_output_1 = CompletionOutput(
index=0,
text="Hello",
token_ids=[100],
cumulative_logprob=None,
logprobs=None,
finish_reason=None,
)
request_output_1 = RequestOutput(
request_id="test-id",
prompt="Test prompt",
prompt_token_ids=[1, 2, 3],
prompt_logprobs=None,
outputs=[completion_output_1],
finished=False,
metrics=None,
lora_request=None,
encoder_prompt=None,
encoder_prompt_token_ids=None,
)
completion_output_2 = CompletionOutput(
index=0,
text="Hello",
token_ids=[100],
cumulative_logprob=None,
logprobs=None,
finish_reason="error",
)
request_output_2 = RequestOutput(
request_id="test-id",
prompt="Test prompt",
prompt_token_ids=[1, 2, 3],
prompt_logprobs=None,
outputs=[completion_output_2],
finished=True,
metrics=None,
lora_request=None,
encoder_prompt=None,
encoder_prompt_token_ids=None,
)
async def mock_generate(*args, **kwargs):
yield request_output_1
yield request_output_2
mock_engine.generate = MagicMock(side_effect=mock_generate)
request = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{"role": "user", "content": "Test prompt"}],
max_tokens=10,
stream=True,
)
response = await serving_chat.create_chat_completion(request)
chunks = []
async for chunk in response:
chunks.append(chunk)
assert len(chunks) >= 2
assert any("Internal server error" in chunk for chunk in chunks), (
f"Expected error message in chunks: {chunks}"
)
assert chunks[-1] == "data: [DONE]\n\n"
@pytest.mark.parametrize(
"image_content",
[
[{"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}],
[{"image_url": {"url": "https://example.com/image.jpg"}}],
],
)
def test_system_message_warns_on_image(image_content):
"""Test that system messages with image content trigger a warning."""
with patch(
"vllm.entrypoints.openai.chat_completion.protocol.logger"
) as mock_logger:
ChatCompletionRequest(
model=MODEL_NAME,
messages=[
{
"role": "system",
"content": image_content,
}
],
)
mock_logger.warning_once.assert_called()
call_args = str(mock_logger.warning_once.call_args)
assert "System messages should only contain text" in call_args
assert "image_url" in call_args
def test_system_message_accepts_text():
"""Test that system messages can contain text content."""
# Should not raise an exception
request = ChatCompletionRequest(
model=MODEL_NAME,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
],
)
assert request.messages[0]["role"] == "system"
def test_system_message_accepts_text_array():
"""Test that system messages can contain an array with text content."""
# Should not raise an exception
request = ChatCompletionRequest(
model=MODEL_NAME,
messages=[
{
"role": "system",
"content": [{"type": "text", "text": "You are a helpful assistant."}],
},
],
)
assert request.messages[0]["role"] == "system"
def test_user_message_accepts_image():
"""Test that user messages can still contain image content."""
# Should not raise an exception
request = ChatCompletionRequest(
model=MODEL_NAME,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": "https://example.com/image.jpg"},
},
],
},
],
)
assert request.messages[0]["role"] == "user"
@pytest.mark.parametrize(
"audio_content",
[
[
{
"type": "input_audio",
"input_audio": {"data": "base64data", "format": "wav"},
}
],
[{"input_audio": {"data": "base64data", "format": "wav"}}],
],
)
def test_system_message_warns_on_audio(audio_content):
"""Test that system messages with audio content trigger a warning."""
with patch(
"vllm.entrypoints.openai.chat_completion.protocol.logger"
) as mock_logger:
ChatCompletionRequest(
model=MODEL_NAME,
messages=[
{
"role": "system",
"content": audio_content,
}
],
)
mock_logger.warning_once.assert_called()
call_args = str(mock_logger.warning_once.call_args)
assert "System messages should only contain text" in call_args
assert "input_audio" in call_args
@pytest.mark.parametrize(
"video_content",
[
[{"type": "video_url", "video_url": {"url": "https://example.com/video.mp4"}}],
[{"video_url": {"url": "https://example.com/video.mp4"}}],
],
)
def test_system_message_warns_on_video(video_content):
"""Test that system messages with video content trigger a warning."""
with patch(
"vllm.entrypoints.openai.chat_completion.protocol.logger"
) as mock_logger:
ChatCompletionRequest(
model=MODEL_NAME,
messages=[
{
"role": "system",
"content": video_content,
}
],
)
mock_logger.warning_once.assert_called()
call_args = str(mock_logger.warning_once.call_args)
assert "System messages should only contain text" in call_args
assert "video_url" in call_args
def test_json_schema_response_format_missing_schema():
"""When response_format type is 'json_schema' but the json_schema field
is not provided, request construction should raise a validation error
so the API returns 400 instead of 500."""
with pytest.raises(Exception, match="json_schema.*must be provided"):
ChatCompletionRequest(
model=MODEL_NAME,
messages=[{"role": "user", "content": "hello"}],
response_format={"type": "json_schema"},
)