[Frontend] Delegate preprocessing to OpenAIServingRender (#36483)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
This commit is contained in:
@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.engine.protocol import GenerationError
|
||||
from vllm.entrypoints.openai.models.protocol import BaseModelPath
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.renderers.hf import HfRenderer
|
||||
from vllm.tokenizers.registry import tokenizer_args_from_config
|
||||
@@ -84,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
||||
engine_client=engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
)
|
||||
serving_render = OpenAIServingRender(
|
||||
model_config=engine.model_config,
|
||||
renderer=engine.renderer,
|
||||
io_processor=engine.io_processor,
|
||||
model_registry=models.registry,
|
||||
request_logger=None,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
)
|
||||
serving_chat = OpenAIServingChat(
|
||||
engine,
|
||||
models,
|
||||
response_role="assistant",
|
||||
openai_serving_render=serving_render,
|
||||
request_logger=None,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
@@ -100,7 +111,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
||||
[{"prompt_token_ids": [1, 2, 3]}],
|
||||
)
|
||||
|
||||
serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
|
||||
serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
|
||||
side_effect=_fake_preprocess_chat
|
||||
)
|
||||
return serving_chat
|
||||
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.engine.protocol import GenerationError
|
||||
from vllm.entrypoints.openai.models.protocol import BaseModelPath
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.renderers.hf import HfRenderer
|
||||
from vllm.tokenizers.registry import tokenizer_args_from_config
|
||||
@@ -74,9 +75,19 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
|
||||
engine_client=engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
)
|
||||
serving_render = OpenAIServingRender(
|
||||
model_config=engine.model_config,
|
||||
renderer=engine.renderer,
|
||||
io_processor=engine.io_processor,
|
||||
model_registry=models.registry,
|
||||
request_logger=None,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
)
|
||||
return OpenAIServingCompletion(
|
||||
engine,
|
||||
models,
|
||||
openai_serving_render=serving_render,
|
||||
request_logger=None,
|
||||
)
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.models.protocol import BaseModelPath
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
from vllm.renderers.hf import HfRenderer
|
||||
@@ -145,8 +146,17 @@ def mock_serving_setup():
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
)
|
||||
|
||||
serving_render = OpenAIServingRender(
|
||||
model_config=mock_engine.model_config,
|
||||
renderer=mock_engine.renderer,
|
||||
io_processor=mock_engine.io_processor,
|
||||
model_registry=models.registry,
|
||||
request_logger=None,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
)
|
||||
serving_completion = OpenAIServingCompletion(
|
||||
mock_engine, models, request_logger=None
|
||||
mock_engine, models, openai_serving_render=serving_render, request_logger=None
|
||||
)
|
||||
|
||||
return mock_engine, serving_completion
|
||||
|
||||
@@ -21,8 +21,13 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorResponse,
|
||||
RequestResponseMetadata,
|
||||
)
|
||||
from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels
|
||||
from vllm.entrypoints.openai.models.serving import (
|
||||
BaseModelPath,
|
||||
OpenAIModelRegistry,
|
||||
OpenAIServingModels,
|
||||
)
|
||||
from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.exceptions import VLLMValidationError
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
@@ -557,15 +562,32 @@ def _build_renderer(model_config: MockModelConfig):
|
||||
)
|
||||
|
||||
|
||||
def _build_serving_render(
    engine, model_registry: OpenAIModelRegistry
) -> OpenAIServingRender:
    """Construct the ``OpenAIServingRender`` handed to ``OpenAIServingChat``.

    The render service is wired to the given engine's model config, renderer
    and IO processor, resolves models through ``model_registry``, uses the
    module-level ``CHAT_TEMPLATE`` with ``"auto"`` content-format detection,
    and has request logging disabled.

    Args:
        engine: Object exposing ``model_config``, ``renderer`` and
            ``io_processor`` attributes.
        model_registry: Registry passed through as ``model_registry``.

    Returns:
        The newly created ``OpenAIServingRender`` instance.
    """
    # Collect the keyword arguments first so the wiring is readable at a
    # glance; attribute reads happen in the same order as the keywords.
    render_kwargs = dict(
        model_config=engine.model_config,
        renderer=engine.renderer,
        io_processor=engine.io_processor,
        model_registry=model_registry,
        request_logger=None,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
    )
    return OpenAIServingRender(**render_kwargs)
|
||||
|
||||
|
||||
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
||||
models = OpenAIServingModels(
|
||||
engine_client=engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
)
|
||||
openai_serving_render = _build_serving_render(engine, models.registry)
|
||||
|
||||
serving_chat = OpenAIServingChat(
|
||||
engine,
|
||||
models,
|
||||
response_role="assistant",
|
||||
openai_serving_render=openai_serving_render,
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None,
|
||||
@@ -586,10 +608,13 @@ async def _async_serving_chat_init():
|
||||
engine = MockEngine()
|
||||
|
||||
models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
|
||||
openai_serving_render = _build_serving_render(engine, models.registry)
|
||||
|
||||
serving_completion = OpenAIServingChat(
|
||||
engine,
|
||||
models,
|
||||
response_role="assistant",
|
||||
openai_serving_render=openai_serving_render,
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None,
|
||||
@@ -1182,7 +1207,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the first turn's input
|
||||
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
input_messages, _ = serving_chat._make_request_with_harmony(req)
|
||||
input_messages, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages,
|
||||
[
|
||||
@@ -1209,7 +1236,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
input_messages_2, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
[
|
||||
@@ -1230,7 +1259,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the first turn's input
|
||||
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages, _ = serving_chat._make_request_with_harmony(req)
|
||||
input_messages, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages,
|
||||
[
|
||||
@@ -1274,7 +1305,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
input_messages_2, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
[
|
||||
@@ -1311,7 +1344,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the first turn's input
|
||||
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages, _ = serving_chat._make_request_with_harmony(req)
|
||||
input_messages, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages,
|
||||
[
|
||||
@@ -1355,7 +1390,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
input_messages_2, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
[
|
||||
@@ -1392,7 +1429,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the first turn's input
|
||||
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages, _ = serving_chat._make_request_with_harmony(req)
|
||||
input_messages, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages,
|
||||
[
|
||||
@@ -1436,7 +1475,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
input_messages_2, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
[
|
||||
@@ -1486,7 +1527,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the third turn's input
|
||||
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
|
||||
input_messages_3, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req_3)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages_3,
|
||||
[
|
||||
@@ -1549,7 +1592,9 @@ class TestServingChatWithHarmony:
|
||||
|
||||
# Test the Harmony messages for the fourth turn's input
|
||||
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
|
||||
input_messages_4, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req_4)
|
||||
)
|
||||
verify_harmony_messages(
|
||||
input_messages_4,
|
||||
[
|
||||
@@ -1598,7 +1643,9 @@ class TestServingChatWithHarmony:
|
||||
},
|
||||
]
|
||||
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
input_messages, _ = serving_chat._make_request_with_harmony(req)
|
||||
input_messages, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req)
|
||||
)
|
||||
|
||||
verify_harmony_messages(
|
||||
input_messages,
|
||||
@@ -1629,7 +1676,9 @@ class TestServingChatWithHarmony:
|
||||
},
|
||||
]
|
||||
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
input_messages, _ = serving_chat._make_request_with_harmony(req)
|
||||
input_messages, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req)
|
||||
)
|
||||
|
||||
verify_harmony_messages(
|
||||
input_messages,
|
||||
@@ -1658,7 +1707,9 @@ class TestServingChatWithHarmony:
|
||||
},
|
||||
]
|
||||
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
input_messages, _ = serving_chat._make_request_with_harmony(req)
|
||||
input_messages, _ = (
|
||||
serving_chat.openai_serving_render._make_request_with_harmony(req)
|
||||
)
|
||||
|
||||
verify_harmony_messages(
|
||||
input_messages,
|
||||
@@ -1689,11 +1740,14 @@ async def test_tool_choice_validation_without_parser():
|
||||
engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
)
|
||||
openai_serving_render = _build_serving_render(mock_engine, models.registry)
|
||||
|
||||
# Create serving_chat without tool_parser (enable_auto_tools=False)
|
||||
serving_chat = OpenAIServingChat(
|
||||
mock_engine,
|
||||
models,
|
||||
response_role="assistant",
|
||||
openai_serving_render=openai_serving_render,
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None,
|
||||
|
||||
@@ -508,11 +508,25 @@ async def test_header_dp_rank_argument():
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
)
|
||||
|
||||
# Create render serving instance (required by OpenAIServingChat)
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
|
||||
serving_render = OpenAIServingRender(
|
||||
model_config=engine.model_config,
|
||||
renderer=engine.renderer,
|
||||
io_processor=engine.io_processor,
|
||||
model_registry=models.registry,
|
||||
request_logger=None,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
)
|
||||
|
||||
# Create serving chat instance
|
||||
serving_chat = OpenAIServingChat(
|
||||
engine_client=engine,
|
||||
models=models,
|
||||
response_role="assistant",
|
||||
openai_serving_render=serving_render,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None,
|
||||
|
||||
@@ -10,7 +10,7 @@ import logging
|
||||
import time
|
||||
import uuid
|
||||
from collections.abc import AsyncGenerator
|
||||
from typing import Any
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from fastapi import Request
|
||||
|
||||
@@ -43,6 +43,9 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -59,6 +62,7 @@ class AnthropicServingMessages(OpenAIServingChat):
|
||||
models: OpenAIServingModels,
|
||||
response_role: str,
|
||||
*,
|
||||
openai_serving_render: "OpenAIServingRender",
|
||||
request_logger: RequestLogger | None,
|
||||
chat_template: str | None,
|
||||
chat_template_content_format: ChatTemplateContentFormatOption,
|
||||
@@ -73,6 +77,7 @@ class AnthropicServingMessages(OpenAIServingChat):
|
||||
engine_client=engine_client,
|
||||
models=models,
|
||||
response_role=response_role,
|
||||
openai_serving_render=openai_serving_render,
|
||||
request_logger=request_logger,
|
||||
chat_template=chat_template,
|
||||
chat_template_content_format=chat_template_content_format,
|
||||
|
||||
@@ -6,12 +6,11 @@ import json
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, AsyncIterator
|
||||
from collections.abc import Sequence as GenericSequence
|
||||
from typing import Any, Final
|
||||
from typing import TYPE_CHECKING, Any, Final
|
||||
|
||||
import partial_json_parser
|
||||
import regex as re
|
||||
from fastapi import Request
|
||||
from openai_harmony import Message as OpenAIMessage
|
||||
from partial_json_parser.core.options import Allow
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
@@ -56,17 +55,13 @@ from vllm.entrypoints.openai.engine.serving import (
|
||||
)
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.parser.harmony_utils import (
|
||||
get_developer_message,
|
||||
get_stop_tokens_for_assistant_actions,
|
||||
get_streamable_parser_for_assistant,
|
||||
get_system_message,
|
||||
parse_chat_inputs_to_harmony_messages,
|
||||
parse_chat_output,
|
||||
render_for_completion,
|
||||
)
|
||||
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
|
||||
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
|
||||
from vllm.inputs.data import ProcessorInputs, TokensPrompt
|
||||
from vllm.inputs.data import ProcessorInputs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
@@ -80,7 +75,9 @@ from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
|
||||
from vllm.tool_parsers.utils import partial_json_loads
|
||||
from vllm.utils.collection_utils import as_list
|
||||
from vllm.utils.mistral import is_mistral_tokenizer
|
||||
from vllm.utils.mistral import mt as _mt
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -92,6 +89,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
models: OpenAIServingModels,
|
||||
response_role: str,
|
||||
*,
|
||||
openai_serving_render: "OpenAIServingRender",
|
||||
request_logger: RequestLogger | None,
|
||||
chat_template: str | None,
|
||||
chat_template_content_format: ChatTemplateContentFormatOption,
|
||||
@@ -114,6 +112,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
return_tokens_as_token_ids=return_tokens_as_token_ids,
|
||||
)
|
||||
|
||||
self.openai_serving_render = openai_serving_render
|
||||
self.response_role = response_role
|
||||
self.chat_template = chat_template
|
||||
self.chat_template_content_format: Final = chat_template_content_format
|
||||
@@ -186,7 +185,10 @@ class OpenAIServingChat(OpenAIServing):
|
||||
request: ChatCompletionRequest,
|
||||
) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
|
||||
"""
|
||||
render chat request by validating and preprocessing inputs.
|
||||
Validate the model and preprocess a chat completion request.
|
||||
|
||||
Delegates preprocessing logic to OpenAIServingRender, adding the
|
||||
engine-aware checks (LoRA model validation, engine health).
|
||||
|
||||
Returns:
|
||||
A tuple of (conversation, engine_prompts) on success,
|
||||
@@ -203,78 +205,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
if self.engine_client.errored:
|
||||
raise self.engine_client.dead_error
|
||||
|
||||
tokenizer = self.renderer.tokenizer
|
||||
|
||||
tool_parser = self.tool_parser
|
||||
|
||||
if is_mistral_tokenizer(tokenizer):
|
||||
# because of issues with pydantic we need to potentially
|
||||
# re-serialize the tool_calls field of the request
|
||||
# for more info: see comment in `maybe_serialize_tool_calls`
|
||||
_mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type]
|
||||
_mt.truncate_tool_call_ids(request) # type: ignore[arg-type]
|
||||
_mt.validate_request_params(request)
|
||||
|
||||
# Check if tool parsing is unavailable (common condition)
|
||||
tool_parsing_unavailable = (
|
||||
tool_parser is None
|
||||
and not is_mistral_tokenizer(tokenizer)
|
||||
and not self.use_harmony
|
||||
)
|
||||
|
||||
# Validate tool_choice when tool parsing is required but unavailable
|
||||
if tool_parsing_unavailable and request.tool_choice not in (
|
||||
None,
|
||||
"none",
|
||||
):
|
||||
if request.tool_choice == "auto" and not self.enable_auto_tools:
|
||||
# for hf tokenizers, "auto" tools requires
|
||||
# --enable-auto-tool-choice and --tool-call-parser
|
||||
return self.create_error_response(
|
||||
'"auto" tool choice requires '
|
||||
"--enable-auto-tool-choice and --tool-call-parser to be set"
|
||||
)
|
||||
elif request.tool_choice != "auto":
|
||||
# "required" or named tool requires tool parser
|
||||
return self.create_error_response(
|
||||
f'tool_choice="{request.tool_choice}" requires '
|
||||
"--tool-call-parser to be set"
|
||||
)
|
||||
|
||||
if request.tools is None or (
|
||||
request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
|
||||
):
|
||||
tool_dicts = None
|
||||
else:
|
||||
tool_dicts = [tool.model_dump() for tool in request.tools]
|
||||
|
||||
if not self.use_harmony:
|
||||
# Common case.
|
||||
error_check_ret = self._validate_chat_template(
|
||||
request_chat_template=request.chat_template,
|
||||
chat_template_kwargs=request.chat_template_kwargs,
|
||||
trust_request_chat_template=self.trust_request_chat_template,
|
||||
)
|
||||
if error_check_ret is not None:
|
||||
return error_check_ret
|
||||
|
||||
conversation, engine_prompts = await self._preprocess_chat(
|
||||
request,
|
||||
request.messages,
|
||||
default_template=self.chat_template,
|
||||
default_template_content_format=self.chat_template_content_format,
|
||||
default_template_kwargs=self.default_chat_template_kwargs,
|
||||
tool_dicts=tool_dicts,
|
||||
tool_parser=tool_parser,
|
||||
)
|
||||
else:
|
||||
# For GPT-OSS.
|
||||
should_include_tools = tool_dicts is not None
|
||||
conversation, engine_prompts = self._make_request_with_harmony(
|
||||
request, should_include_tools
|
||||
)
|
||||
|
||||
return conversation, engine_prompts
|
||||
return await self.openai_serving_render.render_chat(request)
|
||||
|
||||
async def create_chat_completion(
|
||||
self,
|
||||
@@ -1875,50 +1806,3 @@ class OpenAIServingChat(OpenAIServing):
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
def _make_request_with_harmony(
    self,
    request: ChatCompletionRequest,
    should_include_tools: bool = True,
):
    """Build Harmony (GPT-OSS) messages and the engine prompt for a chat request.

    Args:
        request: The chat completion request. Its ``tool_calls`` field may be
            re-serialized in place (see ``maybe_serialize_tool_calls``).
        should_include_tools: Whether custom tools are advertised in the
            system message and passed to the developer message.

    Returns:
        A tuple ``(messages, [engine_prompt])`` where ``messages`` is the list
        of Harmony messages and ``engine_prompt`` is a ``TokensPrompt`` holding
        the rendered prompt token ids (plus ``cache_salt`` when the request
        supplies one).

    Raises:
        ValueError: If ``request.reasoning_effort`` is ``"none"``.
        AssertionError: If browsing or the code interpreter is reported as
            supported; neither is handled here.
    """
    # because of issues with pydantic we need to potentially
    # re-serialize the tool_calls field of the request
    # for more info: see comment in `maybe_serialize_tool_calls`
    _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]

    # NOTE: In Chat Completion API, browsing is enabled by default
    # if the model supports it. TODO: Support browsing.
    assert not self.supports_browsing
    assert not self.supports_code_interpreter

    # The local name is part of the error text via the `=` f-string specifier.
    reasoning_effort = request.reasoning_effort
    if reasoning_effort == "none":
        raise ValueError(f"Harmony does not support {reasoning_effort=}")

    # System message always comes first.
    harmony_msgs: list[OpenAIMessage] = [
        get_system_message(
            reasoning_effort=reasoning_effort,
            browser_description=None,
            python_description=None,
            with_custom_tools=should_include_tools,
        )
    ]

    # Developer message is emitted whenever the request carries tools, but the
    # tool list itself is withheld when tools should not be included.
    if request.tools:
        exposed_tools = request.tools if should_include_tools else None
        harmony_msgs.append(
            get_developer_message(tools=exposed_tools)  # type: ignore[arg-type]
        )

    # Conversation turns supplied by the caller.
    harmony_msgs += parse_chat_inputs_to_harmony_messages(request.messages)

    # Render prompt token ids.
    engine_prompt = TokensPrompt(
        prompt_token_ids=render_for_completion(harmony_msgs)
    )

    # Add cache_salt if provided in the request
    salt = request.cache_salt
    if salt is not None:
        engine_prompt["cache_salt"] = salt

    return harmony_msgs, [engine_prompt]
|
||||
|
||||
@@ -5,7 +5,7 @@ import asyncio
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, AsyncIterator
|
||||
from collections.abc import Sequence as GenericSequence
|
||||
from typing import cast
|
||||
from typing import TYPE_CHECKING, cast
|
||||
|
||||
from fastapi import Request
|
||||
|
||||
@@ -42,6 +42,9 @@ from vllm.tokenizers import TokenizerLike
|
||||
from vllm.utils.async_utils import merge_async_iterators
|
||||
from vllm.utils.collection_utils import as_list
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -51,6 +54,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
engine_client: EngineClient,
|
||||
models: OpenAIServingModels,
|
||||
*,
|
||||
openai_serving_render: "OpenAIServingRender",
|
||||
request_logger: RequestLogger | None,
|
||||
return_tokens_as_token_ids: bool = False,
|
||||
enable_prompt_tokens_details: bool = False,
|
||||
@@ -63,6 +67,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
return_tokens_as_token_ids=return_tokens_as_token_ids,
|
||||
)
|
||||
|
||||
self.openai_serving_render = openai_serving_render
|
||||
self.enable_prompt_tokens_details = enable_prompt_tokens_details
|
||||
self.enable_force_include_usage = enable_force_include_usage
|
||||
|
||||
@@ -79,7 +84,10 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
request: CompletionRequest,
|
||||
) -> list[ProcessorInputs] | ErrorResponse:
|
||||
"""
|
||||
render completion request by validating and preprocessing inputs.
|
||||
Validate the model and preprocess a completion request.
|
||||
|
||||
Delegates preprocessing logic to OpenAIServingRender, adding the
|
||||
engine-aware checks (LoRA model validation, engine health).
|
||||
|
||||
Returns:
|
||||
A list of engine_prompts on success,
|
||||
@@ -95,25 +103,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
if self.engine_client.errored:
|
||||
raise self.engine_client.dead_error
|
||||
|
||||
# Return error for unsupported features.
|
||||
if request.suffix is not None:
|
||||
return self.create_error_response("suffix is not currently supported")
|
||||
|
||||
if request.echo and request.prompt_embeds is not None:
|
||||
return self.create_error_response("Echo is unsupported with prompt embeds.")
|
||||
|
||||
if request.prompt_logprobs is not None and request.prompt_embeds is not None:
|
||||
return self.create_error_response(
|
||||
"prompt_logprobs is not compatible with prompt embeds."
|
||||
)
|
||||
|
||||
engine_prompts = await self._preprocess_completion(
|
||||
request,
|
||||
prompt_input=request.prompt,
|
||||
prompt_embeds=request.prompt_embeds,
|
||||
)
|
||||
|
||||
return engine_prompts
|
||||
return await self.openai_serving_render.render_completion(request)
|
||||
|
||||
async def create_completion(
|
||||
self,
|
||||
|
||||
@@ -72,6 +72,29 @@ async def init_generate_state(
|
||||
tool_server = None
|
||||
resolved_chat_template = load_chat_template(args.chat_template)
|
||||
|
||||
# Render endpoints are always backed by OpenAIServingRender so that
|
||||
# /v1/chat/completions/render and /v1/completions/render work on both
|
||||
# generate-mode and render-only servers.
|
||||
# It is created first so that OpenAIServingChat and OpenAIServingCompletion
|
||||
# can delegate their preprocessing logic to it.
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
|
||||
state.openai_serving_render = OpenAIServingRender(
|
||||
model_config=engine_client.model_config,
|
||||
renderer=engine_client.renderer,
|
||||
io_processor=engine_client.io_processor,
|
||||
model_registry=state.openai_serving_models.registry,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
trust_request_chat_template=args.trust_request_chat_template,
|
||||
enable_auto_tools=args.enable_auto_tool_choice,
|
||||
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
|
||||
tool_parser=args.tool_call_parser,
|
||||
default_chat_template_kwargs=args.default_chat_template_kwargs,
|
||||
log_error_stack=args.log_error_stack,
|
||||
)
|
||||
|
||||
state.openai_serving_responses = (
|
||||
OpenAIServingResponses(
|
||||
engine_client,
|
||||
@@ -96,6 +119,7 @@ async def init_generate_state(
|
||||
engine_client,
|
||||
state.openai_serving_models,
|
||||
args.response_role,
|
||||
openai_serving_render=state.openai_serving_render,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
@@ -120,6 +144,7 @@ async def init_generate_state(
|
||||
OpenAIServingCompletion(
|
||||
engine_client,
|
||||
state.openai_serving_models,
|
||||
openai_serving_render=state.openai_serving_render,
|
||||
request_logger=request_logger,
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
@@ -133,6 +158,7 @@ async def init_generate_state(
|
||||
engine_client,
|
||||
state.openai_serving_models,
|
||||
args.response_role,
|
||||
openai_serving_render=state.openai_serving_render,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
@@ -159,24 +185,3 @@ async def init_generate_state(
|
||||
if "generate" in supported_tasks
|
||||
else None
|
||||
)
|
||||
|
||||
# Render endpoints are always backed by OpenAIServingRender so that
|
||||
# /v1/chat/completions/render and /v1/completions/render work on both
|
||||
# generate-mode and render-only servers.
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
|
||||
state.openai_serving_render = OpenAIServingRender(
|
||||
model_config=engine_client.model_config,
|
||||
renderer=engine_client.renderer,
|
||||
io_processor=engine_client.io_processor,
|
||||
model_registry=state.openai_serving_models.registry,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
trust_request_chat_template=args.trust_request_chat_template,
|
||||
enable_auto_tools=args.enable_auto_tool_choice,
|
||||
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
|
||||
tool_parser=args.tool_call_parser,
|
||||
default_chat_template_kwargs=args.default_chat_template_kwargs,
|
||||
log_error_stack=args.log_error_stack,
|
||||
)
|
||||
|
||||
@@ -87,15 +87,26 @@ class OpenAIServingRender:
|
||||
self,
|
||||
request: ChatCompletionRequest,
|
||||
) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
|
||||
"""Copied from OpenAIServingChat.render_chat_request.
|
||||
"""Validate the model and preprocess a chat completion request.
|
||||
|
||||
Differences: engine_client.errored check removed (no engine client).
|
||||
This is the authoritative implementation used directly by the
|
||||
GPU-less render server and delegated to by OpenAIServingChat.
|
||||
"""
|
||||
error_check_ret = await self._check_model(request)
|
||||
if error_check_ret is not None:
|
||||
logger.error("Error with model %s", error_check_ret)
|
||||
return error_check_ret
|
||||
return await self.render_chat(request)
|
||||
|
||||
async def render_chat(
|
||||
self,
|
||||
request: ChatCompletionRequest,
|
||||
) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
|
||||
"""Core preprocessing logic for chat requests (no model/engine check).
|
||||
|
||||
Called directly by render_chat_request and delegated to by
|
||||
OpenAIServingChat.render_chat_request after its engine-aware checks.
|
||||
"""
|
||||
tokenizer = self.renderer.tokenizer
|
||||
|
||||
tool_parser = self.tool_parser
|
||||
@@ -173,14 +184,25 @@ class OpenAIServingRender:
|
||||
self,
|
||||
request: CompletionRequest,
|
||||
) -> list[ProcessorInputs] | ErrorResponse:
|
||||
"""Copied from OpenAIServingCompletion.render_completion_request.
|
||||
"""Validate the model and preprocess a completion request.
|
||||
|
||||
Differences: engine_client.errored check removed (no engine client).
|
||||
This is the authoritative implementation used directly by the
|
||||
GPU-less render server and delegated to by OpenAIServingCompletion.
|
||||
"""
|
||||
error_check_ret = await self._check_model(request)
|
||||
if error_check_ret is not None:
|
||||
return error_check_ret
|
||||
return await self.render_completion(request)
|
||||
|
||||
async def render_completion(
|
||||
self,
|
||||
request: CompletionRequest,
|
||||
) -> list[ProcessorInputs] | ErrorResponse:
|
||||
"""Core preprocessing logic for completion requests (no model/engine check).
|
||||
|
||||
Called directly by render_completion_request and delegated to by
|
||||
OpenAIServingCompletion.render_completion_request after its engine-aware checks.
|
||||
"""
|
||||
# Return error for unsupported features.
|
||||
if request.suffix is not None:
|
||||
return self.create_error_response("suffix is not currently supported")
|
||||
@@ -206,7 +228,7 @@ class OpenAIServingRender:
|
||||
request: ChatCompletionRequest,
|
||||
should_include_tools: bool = True,
|
||||
):
|
||||
"""Copied from OpenAIServingChat._make_request_with_harmony."""
|
||||
"""Build Harmony (GPT-OSS) messages and engine prompt from a chat request."""
|
||||
messages: list[OpenAIMessage] = []
|
||||
|
||||
# because of issues with pydantic we need to potentially
|
||||
@@ -219,11 +241,10 @@ class OpenAIServingRender:
|
||||
# if the model supports it. TODO: Support browsing.
|
||||
assert not self.supports_browsing
|
||||
assert not self.supports_code_interpreter
|
||||
assert request.reasoning_effort != "none", (
|
||||
"Harmony does not support reasoning_effort='none'"
|
||||
)
|
||||
if (reasoning_effort := request.reasoning_effort) == "none":
|
||||
raise ValueError(f"Harmony does not support {reasoning_effort=}")
|
||||
sys_msg = get_system_message(
|
||||
reasoning_effort=request.reasoning_effort,
|
||||
reasoning_effort=reasoning_effort,
|
||||
browser_description=None,
|
||||
python_description=None,
|
||||
with_custom_tools=should_include_tools,
|
||||
|
||||
Reference in New Issue
Block a user