[Refactor] [6/N] to simplify the vLLM openai chat_completion serving architecture (#32240)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Author: Chauncey
Date: 2026-01-13 21:01:39 +08:00
Committed by: GitHub
Parent: a5bbbd2f24
Commit: fefce49807
128 changed files with 1221 additions and 1008 deletions
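The change is mechanical across all 128 files: the flat `vllm.entrypoints.openai.protocol` and `serving_*` modules are split into per-API subpackages, with chat-specific names moving under `chat_completion/` and shared names under `engine/`. A minimal before/after sketch of the import migration, using only module paths that appear in the hunks below:

# Before this commit (old flat layout):
#   from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
#   from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
#   from vllm.entrypoints.openai.serving_engine import OpenAIServing

# After this commit: chat-specific names live under chat_completion/,
# shared engine-level names under engine/.
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.engine.serving import OpenAIServing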

View File

@@ -6,8 +6,8 @@ from unittest.mock import MagicMock
 import pytest
-from vllm.entrypoints.openai.protocol import ErrorResponse
-from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing
 @pytest.mark.asyncio

View File

@@ -7,7 +7,7 @@ import json
 import pytest
 from openai.types.responses import ResponseFunctionToolCall
-from vllm.entrypoints.openai.protocol import ResponsesRequest
+from vllm.entrypoints.openai.engine.protocol import ResponsesRequest
 def test_function_call_dict_converted_to_object():
@@ -253,7 +253,7 @@ def test_function_call_validation_failure_logs_debug(caplog):
     }
     # Mock the logger to verify debug was called
-    with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger:
+    with patch("vllm.entrypoints.openai.engine.protocol.logger") as mock_logger:
         with pytest.raises(ValueError):
             ResponsesRequest(**request_data)

View File

@@ -9,8 +9,9 @@ from unittest.mock import AsyncMock, MagicMock
 import pytest
 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.tokenizers import get_tokenizer

View File

@@ -5,7 +5,7 @@ import pytest
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.tokenizers import get_tokenizer
 from ...models.registry import HF_EXAMPLE_MODELS

View File

@@ -9,7 +9,7 @@ from unittest.mock import AsyncMock, MagicMock
 import pytest
 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput

View File

@@ -8,7 +8,7 @@ from unittest.mock import Mock
 import pytest
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     StructuredOutputsParams,
 )
 from vllm.entrypoints.tool_server import ToolServer

View File

@@ -9,7 +9,7 @@ from unittest.mock import AsyncMock, MagicMock
 import pytest
 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest

View File

@@ -4,7 +4,10 @@ from openai_harmony import (
     Message,
 )
-from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages
+from vllm.entrypoints.openai.engine.protocol import (
+    serialize_message,
+    serialize_messages,
+)
 def test_serialize_message() -> None:

View File

@@ -11,14 +11,16 @@ import pytest_asyncio
 from openai import OpenAI
 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     RequestResponseMetadata,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.tokenizers import get_tokenizer
@@ -1517,12 +1519,12 @@ class TestCreateRemainingArgsDelta:
     def test_preserves_id_type_name(self):
         """Test that id, type, and name are preserved from original delta."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
         original_delta = DeltaMessage(
             tool_calls=[
@@ -1552,12 +1554,12 @@ class TestCreateRemainingArgsDelta:
     def test_matches_by_index(self):
         """Test that the correct tool call is matched by index."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
         original_delta = DeltaMessage(
             tool_calls=[
@@ -1588,12 +1590,12 @@ class TestCreateRemainingArgsDelta:
     def test_no_matching_tool_call(self):
         """Test graceful handling when no matching tool call is found."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
         original_delta = DeltaMessage(
             tool_calls=[
@@ -1620,8 +1622,8 @@ class TestCreateRemainingArgsDelta:
     def test_function_is_none(self):
         """Test handling when original tool call has no function."""
-        from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall
         original_delta = DeltaMessage(
             tool_calls=[

View File

@@ -9,7 +9,7 @@ from unittest.mock import patch
 import pytest
-from vllm.entrypoints.openai.serving_chat_stream_harmony import (
+from vllm.entrypoints.openai.chat_completion.stream_harmony import (
     extract_harmony_streaming_delta,
 )
@@ -82,7 +82,7 @@ class TestExtractHarmonyStreamingDelta:
         assert tools_streamed is False
     @pytest.mark.parametrize("channel", ["commentary", "analysis"])
-    @patch("vllm.entrypoints.openai.serving_chat_stream_harmony.make_tool_call_id")
+    @patch("vllm.entrypoints.openai.chat_completion.stream_harmony.make_tool_call_id")
     def test_new_tool_call(self, mock_make_tool_call_id, channel):
         """Test new tool call creation when recipient changes."""
         mock_make_tool_call_id.return_value = "call_test123"
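One subtlety this file shows: `unittest.mock.patch` resolves its target from a string at patch time, so the literals inside `@patch(...)` must track the module move even though no import statement names them, and a stale string only fails once the test runs. A minimal illustration against the new path (a sketch only; the real test body is longer):

from unittest.mock import patch

# patch() imports the module named in the string and swaps the attribute,
# so the old "vllm.entrypoints.openai.serving_chat_stream_harmony..." target
# would now raise ModuleNotFoundError at test runtime, not at import time.
with patch(
    "vllm.entrypoints.openai.chat_completion.stream_harmony.make_tool_call_id"
) as mock_make_tool_call_id:
    mock_make_tool_call_id.return_value = "call_test123"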

View File

@@ -8,7 +8,7 @@ from unittest.mock import Mock
 import pytest
 from vllm.config import ModelConfig
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.tokenizers.mistral import MistralTokenizer

View File

@@ -8,7 +8,7 @@ import pytest
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     LoadLoRAAdapterRequest,
     UnloadLoRAAdapterRequest,

View File

@@ -14,7 +14,7 @@ from openai.types.responses.tool import (
 )
 from vllm.entrypoints.context import ConversationContext
-from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse, ResponsesRequest
 from vllm.entrypoints.openai.serving_responses import (
     OpenAIServingResponses,
     _extract_allowed_tools_from_mcp_requests,

View File

@@ -9,7 +9,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

View File

@@ -5,7 +5,7 @@ import json
 import pytest
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser

View File

@@ -11,7 +11,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall
 from vllm.tool_parsers import ToolParser, ToolParserManager

View File

@@ -5,7 +5,7 @@ from unittest.mock import MagicMock, patch
 import pytest
-from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
+from vllm.entrypoints.openai.engine.protocol import ExtractedToolCallInformation
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser

View File

@@ -9,7 +9,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

View File

@@ -9,7 +9,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

View File

@@ -9,7 +9,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

View File

@@ -3,8 +3,8 @@
 from collections.abc import Iterable
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ExtractedToolCallInformation,
     FunctionCall,

View File

@@ -4,11 +4,13 @@ import json
 from collections.abc import AsyncGenerator
 from typing import Any
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionResponse,
     ChatCompletionResponseChoice,
     ChatCompletionStreamResponse,
     ChatMessage,
-    UsageInfo,
 )
+from vllm.entrypoints.openai.engine.protocol import (
+    UsageInfo,
+)
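Because the old flat paths disappear outright, third-party code importing `vllm.entrypoints.openai.protocol` breaks at this commit. A hypothetical compatibility shim, not part of this change, could keep a deprecation window open by re-exporting the moved names from the old location (only names that appear in the hunks above are used here):

# Hypothetical vllm/entrypoints/openai/protocol.py shim (NOT in this commit):
# re-export moved names so old imports keep working during deprecation.
import warnings

from vllm.entrypoints.openai.chat_completion.protocol import (  # noqa: F401
    ChatCompletionRequest,
    ChatCompletionResponse,
)
from vllm.entrypoints.openai.engine.protocol import (  # noqa: F401
    ErrorResponse,
    FunctionCall,
    UsageInfo,
)

warnings.warn(
    "vllm.entrypoints.openai.protocol is deprecated; import from "
    "chat_completion.protocol or engine.protocol instead",
    DeprecationWarning,
    stacklevel=2,
)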