vllm/tests/entrypoints/openai/test_chat_error.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass, field
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
from vllm.v1.engine.async_llm import AsyncLLM

MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
BASE_MODEL_PATHS = [
    BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
]


@dataclass
class MockHFConfig:
    model_type: str = "any"


@dataclass
class MockModelConfig:
    task = "generate"
    runner_type = "generate"
    model = MODEL_NAME
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    hf_text_config = MockHFConfig()
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False
    is_encoder_decoder: bool = False
    is_multimodal_model: bool = False

    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}


@dataclass
class MockParallelConfig:
    _api_process_rank: int = 0


@dataclass
class MockVllmConfig:
    model_config: MockModelConfig
    parallel_config: MockParallelConfig


def _build_renderer(model_config: MockModelConfig):
    _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)

    return HfRenderer.from_config(
        MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
        tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
    )


def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_chat = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
        request_logger=None,
        chat_template=None,
        chat_template_content_format="auto",
    )

    async def _fake_preprocess_chat(*args, **kwargs):
        # return conversation, engine_prompts
        return (
            [{"role": "user", "content": "Test"}],
            [{"prompt_token_ids": [1, 2, 3]}],
        )

    serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
    return serving_chat


@pytest.mark.asyncio
async def test_chat_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    mock_engine.renderer = _build_renderer(mock_engine.model_config)

    serving_chat = _build_serving_chat(mock_engine)

    completion_output = CompletionOutput(
        index=0,
        text="",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )

    request_output = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield request_output

    mock_engine.generate = MagicMock(side_effect=mock_generate)

    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=False,
    )

    with pytest.raises(GenerationError):
        await serving_chat.create_chat_completion(request)


@pytest.mark.asyncio
async def test_chat_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    mock_engine.renderer = _build_renderer(mock_engine.model_config)

    serving_chat = _build_serving_chat(mock_engine)

    completion_output_1 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=None,
    )

    request_output_1 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_1],
        finished=False,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    completion_output_2 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )

    request_output_2 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_2],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield request_output_1
        yield request_output_2

    mock_engine.generate = MagicMock(side_effect=mock_generate)

    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=True,
    )

    response = await serving_chat.create_chat_completion(request)

    chunks = []
    async for chunk in response:
        chunks.append(chunk)

    assert len(chunks) >= 2
    assert any("Internal server error" in chunk for chunk in chunks), (
        f"Expected error message in chunks: {chunks}"
    )
    assert chunks[-1] == "data: [DONE]\n\n"


@pytest.mark.parametrize(
    "image_content",
    [
        [{"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}],
        [{"image_url": {"url": "https://example.com/image.jpg"}}],
    ],
)
def test_system_message_warns_on_image(image_content):
    """Test that system messages with image content trigger a warning."""
    with patch(
        "vllm.entrypoints.openai.chat_completion.protocol.logger"
    ) as mock_logger:
        ChatCompletionRequest(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": image_content,
                }
            ],
        )

    mock_logger.warning_once.assert_called()
    call_args = str(mock_logger.warning_once.call_args)
    assert "System messages should only contain text" in call_args
    assert "image_url" in call_args


def test_system_message_accepts_text():
    """Test that system messages can contain text content."""
    # Should not raise an exception
    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
        ],
    )
    assert request.messages[0]["role"] == "system"


def test_system_message_accepts_text_array():
    """Test that system messages can contain an array with text content."""
    # Should not raise an exception
    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": "You are a helpful assistant."}],
            },
        ],
    )
    assert request.messages[0]["role"] == "system"


def test_user_message_accepts_image():
    """Test that user messages can still contain image content."""
    # Should not raise an exception
    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://example.com/image.jpg"},
                    },
                ],
            },
        ],
    )
    assert request.messages[0]["role"] == "user"


@pytest.mark.parametrize(
    "audio_content",
    [
        [
            {
                "type": "input_audio",
                "input_audio": {"data": "base64data", "format": "wav"},
            }
        ],
        [{"input_audio": {"data": "base64data", "format": "wav"}}],
    ],
)
def test_system_message_warns_on_audio(audio_content):
    """Test that system messages with audio content trigger a warning."""
    with patch(
        "vllm.entrypoints.openai.chat_completion.protocol.logger"
    ) as mock_logger:
        ChatCompletionRequest(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": audio_content,
                }
            ],
        )

    mock_logger.warning_once.assert_called()
    call_args = str(mock_logger.warning_once.call_args)
    assert "System messages should only contain text" in call_args
    assert "input_audio" in call_args


@pytest.mark.parametrize(
    "video_content",
    [
        [{"type": "video_url", "video_url": {"url": "https://example.com/video.mp4"}}],
        [{"video_url": {"url": "https://example.com/video.mp4"}}],
    ],
)
def test_system_message_warns_on_video(video_content):
    """Test that system messages with video content trigger a warning."""
    with patch(
        "vllm.entrypoints.openai.chat_completion.protocol.logger"
    ) as mock_logger:
        ChatCompletionRequest(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": video_content,
                }
            ],
        )

    mock_logger.warning_once.assert_called()
    call_args = str(mock_logger.warning_once.call_args)
    assert "System messages should only contain text" in call_args
    assert "video_url" in call_args


def test_json_schema_response_format_missing_schema():
    """When response_format type is 'json_schema' but the json_schema field
    is not provided, request construction should raise a validation error
    so the API returns 400 instead of 500."""
    with pytest.raises(Exception, match="json_schema.*must be provided"):
        ChatCompletionRequest(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": "hello"}],
            response_format={"type": "json_schema"},
        )