[Refactor] [6/N] to simplify the vLLM openai chat_completion serving architecture (#32240)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
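This step of the series moves the OpenAI-compatible server's shared protocol and base serving modules into per-domain packages (`engine` for shared pieces, `chat_completion` for chat-specific ones) and updates the test imports accordingly. A condensed before/after sketch of the renames, assembled from the hunks below (illustrative only, not the full module contents):

# Old import paths (removed in the hunks below):
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_chat_stream_harmony import extract_harmony_streaming_delta
from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing

# New import paths (introduced by this refactor):
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.chat_completion.stream_harmony import extract_harmony_streaming_delta
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing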
@@ -6,8 +6,8 @@ from unittest.mock import MagicMock

 import pytest

-from vllm.entrypoints.openai.protocol import ErrorResponse
-from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing


 @pytest.mark.asyncio

@@ -7,7 +7,7 @@ import json
 import pytest
 from openai.types.responses import ResponseFunctionToolCall

-from vllm.entrypoints.openai.protocol import ResponsesRequest
+from vllm.entrypoints.openai.engine.protocol import ResponsesRequest


 def test_function_call_dict_converted_to_object():

@@ -253,7 +253,7 @@ def test_function_call_validation_failure_logs_debug(caplog):
     }

     # Mock the logger to verify debug was called
-    with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger:
+    with patch("vllm.entrypoints.openai.engine.protocol.logger") as mock_logger:
         with pytest.raises(ValueError):
             ResponsesRequest(**request_data)

@@ -9,8 +9,9 @@ from unittest.mock import AsyncMock, MagicMock
 import pytest

 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.tokenizers import get_tokenizer

@@ -5,7 +5,7 @@ import pytest

 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.tokenizers import get_tokenizer

 from ...models.registry import HF_EXAMPLE_MODELS

@@ -9,7 +9,7 @@ from unittest.mock import AsyncMock, MagicMock
 import pytest

 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput

@@ -8,7 +8,7 @@ from unittest.mock import Mock

 import pytest

-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     StructuredOutputsParams,
 )
 from vllm.entrypoints.tool_server import ToolServer

@@ -9,7 +9,7 @@ from unittest.mock import AsyncMock, MagicMock
 import pytest

 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest

@@ -4,7 +4,10 @@ from openai_harmony import (
     Message,
 )

-from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages
+from vllm.entrypoints.openai.engine.protocol import (
+    serialize_message,
+    serialize_messages,
+)


 def test_serialize_message() -> None:

@@ -11,14 +11,16 @@ import pytest_asyncio
 from openai import OpenAI

 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     RequestResponseMetadata,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.tokenizers import get_tokenizer

@@ -1517,12 +1519,12 @@ class TestCreateRemainingArgsDelta:

     def test_preserves_id_type_name(self):
         """Test that id, type, and name are preserved from original delta."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat

         original_delta = DeltaMessage(
             tool_calls=[

@@ -1552,12 +1554,12 @@ class TestCreateRemainingArgsDelta:

     def test_matches_by_index(self):
         """Test that the correct tool call is matched by index."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat

         original_delta = DeltaMessage(
             tool_calls=[

@@ -1588,12 +1590,12 @@ class TestCreateRemainingArgsDelta:

     def test_no_matching_tool_call(self):
         """Test graceful handling when no matching tool call is found."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat

         original_delta = DeltaMessage(
             tool_calls=[

@@ -1620,8 +1622,8 @@ class TestCreateRemainingArgsDelta:

     def test_function_is_none(self):
         """Test handling when original tool call has no function."""
-        from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall

         original_delta = DeltaMessage(
             tool_calls=[

@@ -9,7 +9,7 @@ from unittest.mock import patch

 import pytest

-from vllm.entrypoints.openai.serving_chat_stream_harmony import (
+from vllm.entrypoints.openai.chat_completion.stream_harmony import (
     extract_harmony_streaming_delta,
 )

@@ -82,7 +82,7 @@ class TestExtractHarmonyStreamingDelta:
         assert tools_streamed is False

     @pytest.mark.parametrize("channel", ["commentary", "analysis"])
-    @patch("vllm.entrypoints.openai.serving_chat_stream_harmony.make_tool_call_id")
+    @patch("vllm.entrypoints.openai.chat_completion.stream_harmony.make_tool_call_id")
     def test_new_tool_call(self, mock_make_tool_call_id, channel):
         """Test new tool call creation when recipient changes."""
         mock_make_tool_call_id.return_value = "call_test123"

@@ -8,7 +8,7 @@ from unittest.mock import Mock
 import pytest

 from vllm.config import ModelConfig
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.tokenizers.mistral import MistralTokenizer

@@ -8,7 +8,7 @@ import pytest

 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     LoadLoRAAdapterRequest,
     UnloadLoRAAdapterRequest,

@@ -14,7 +14,7 @@ from openai.types.responses.tool import (
 )

 from vllm.entrypoints.context import ConversationContext
-from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse, ResponsesRequest
 from vllm.entrypoints.openai.serving_responses import (
     OpenAIServingResponses,
     _extract_allowed_tools_from_mcp_requests,

@@ -9,7 +9,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

@@ -5,7 +5,7 @@ import json

 import pytest

-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser

@@ -11,7 +11,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall
 from vllm.tool_parsers import ToolParser, ToolParserManager

@@ -5,7 +5,7 @@ from unittest.mock import MagicMock, patch

 import pytest

-from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
+from vllm.entrypoints.openai.engine.protocol import ExtractedToolCallInformation
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser

@@ -9,7 +9,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

@@ -9,7 +9,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

@@ -9,7 +9,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager

@@ -3,8 +3,8 @@

 from collections.abc import Iterable

-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ExtractedToolCallInformation,
     FunctionCall,

@@ -4,11 +4,13 @@ import json
 from collections.abc import AsyncGenerator
 from typing import Any

-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionResponse,
     ChatCompletionResponseChoice,
     ChatCompletionStreamResponse,
     ChatMessage,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     UsageInfo,
 )