[Frontend] Delegate preprocessing to OpenAIServingRender (#36483)

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
This commit is contained in:
Sage
2026-03-13 09:39:43 +02:00
committed by GitHub
parent a4ad9db541
commit a2268617cf
10 changed files with 203 additions and 196 deletions

View File

@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -84,10 +85,20 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
serving_chat = OpenAIServingChat(
engine,
models,
response_role="assistant",
openai_serving_render=serving_render,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
@@ -100,7 +111,9 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
[{"prompt_token_ids": [1, 2, 3]}],
)
serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
side_effect=_fake_preprocess_chat
)
return serving_chat

View File

@@ -13,6 +13,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import GenerationError
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer
from vllm.tokenizers.registry import tokenizer_args_from_config
@@ -74,9 +75,19 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
return OpenAIServingCompletion(
engine,
models,
openai_serving_render=serving_render,
request_logger=None,
)

View File

@@ -14,6 +14,7 @@ from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.renderers.hf import HfRenderer
@@ -145,8 +146,17 @@ def mock_serving_setup():
base_model_paths=BASE_MODEL_PATHS,
)
serving_render = OpenAIServingRender(
model_config=mock_engine.model_config,
renderer=mock_engine.renderer,
io_processor=mock_engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
serving_completion = OpenAIServingCompletion(
mock_engine, models, request_logger=None
mock_engine, models, openai_serving_render=serving_render, request_logger=None
)
return mock_engine, serving_completion

View File

@@ -21,8 +21,13 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.models.serving import (
BaseModelPath,
OpenAIModelRegistry,
OpenAIServingModels,
)
from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.exceptions import VLLMValidationError
from vllm.inputs import TokensPrompt
from vllm.outputs import CompletionOutput, RequestOutput
@@ -557,15 +562,32 @@ def _build_renderer(model_config: MockModelConfig):
)
def _build_serving_render(
    engine, model_registry: OpenAIModelRegistry
) -> OpenAIServingRender:
    """Create the OpenAIServingRender used by the serving-chat tests.

    The render layer is wired to the given engine's model config, renderer
    and IO processor, and uses the module-level ``CHAT_TEMPLATE`` with the
    "auto" content format. Request logging is disabled.

    Args:
        engine: Engine (real or mock) exposing ``model_config``,
            ``renderer`` and ``io_processor`` attributes.
        model_registry: Registry passed through as ``model_registry``.

    Returns:
        A freshly constructed ``OpenAIServingRender``.
    """
    serving_render = OpenAIServingRender(
        model_config=engine.model_config,
        renderer=engine.renderer,
        io_processor=engine.io_processor,
        model_registry=model_registry,
        request_logger=None,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
    )
    return serving_render
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels(
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
openai_serving_render = _build_serving_render(engine, models.registry)
serving_chat = OpenAIServingChat(
engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,
@@ -586,10 +608,13 @@ async def _async_serving_chat_init():
engine = MockEngine()
models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
openai_serving_render = _build_serving_render(engine, models.registry)
serving_completion = OpenAIServingChat(
engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,
@@ -1182,7 +1207,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1209,7 +1236,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1230,7 +1259,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1274,7 +1305,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1311,7 +1344,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1355,7 +1390,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1392,7 +1429,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1436,7 +1475,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1486,7 +1527,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
input_messages_3, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_3)
)
verify_harmony_messages(
input_messages_3,
[
@@ -1549,7 +1592,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
input_messages_4, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_4)
)
verify_harmony_messages(
input_messages_4,
[
@@ -1598,7 +1643,9 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
@@ -1629,7 +1676,9 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
@@ -1658,7 +1707,9 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
@@ -1689,11 +1740,14 @@ async def test_tool_choice_validation_without_parser():
engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
)
openai_serving_render = _build_serving_render(mock_engine, models.registry)
# Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat = OpenAIServingChat(
mock_engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,

View File

@@ -508,11 +508,25 @@ async def test_header_dp_rank_argument():
base_model_paths=BASE_MODEL_PATHS,
)
# Create render serving instance (required by OpenAIServingChat)
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
)
# Create serving chat instance
serving_chat = OpenAIServingChat(
engine_client=engine,
models=models,
response_role="assistant",
openai_serving_render=serving_render,
chat_template=None,
chat_template_content_format="auto",
request_logger=None,

View File

@@ -10,7 +10,7 @@ import logging
import time
import uuid
from collections.abc import AsyncGenerator
from typing import Any
from typing import TYPE_CHECKING, Any
from fastapi import Request
@@ -43,6 +43,9 @@ from vllm.entrypoints.openai.engine.protocol import (
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
if TYPE_CHECKING:
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
logger = logging.getLogger(__name__)
@@ -59,6 +62,7 @@ class AnthropicServingMessages(OpenAIServingChat):
models: OpenAIServingModels,
response_role: str,
*,
openai_serving_render: "OpenAIServingRender",
request_logger: RequestLogger | None,
chat_template: str | None,
chat_template_content_format: ChatTemplateContentFormatOption,
@@ -73,6 +77,7 @@ class AnthropicServingMessages(OpenAIServingChat):
engine_client=engine_client,
models=models,
response_role=response_role,
openai_serving_render=openai_serving_render,
request_logger=request_logger,
chat_template=chat_template,
chat_template_content_format=chat_template_content_format,

View File

@@ -6,12 +6,11 @@ import json
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import Any, Final
from typing import TYPE_CHECKING, Any, Final
import partial_json_parser
import regex as re
from fastapi import Request
from openai_harmony import Message as OpenAIMessage
from partial_json_parser.core.options import Allow
from vllm.engine.protocol import EngineClient
@@ -56,17 +55,13 @@ from vllm.entrypoints.openai.engine.serving import (
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.parser.harmony_utils import (
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_streamable_parser_for_assistant,
get_system_message,
parse_chat_inputs_to_harmony_messages,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.inputs.data import ProcessorInputs, TokensPrompt
from vllm.inputs.data import ProcessorInputs
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import CompletionOutput, RequestOutput
@@ -80,7 +75,9 @@ from vllm.tool_parsers.mistral_tool_parser import MistralToolCall
from vllm.tool_parsers.utils import partial_json_loads
from vllm.utils.collection_utils import as_list
from vllm.utils.mistral import is_mistral_tokenizer
from vllm.utils.mistral import mt as _mt
if TYPE_CHECKING:
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
logger = init_logger(__name__)
@@ -92,6 +89,7 @@ class OpenAIServingChat(OpenAIServing):
models: OpenAIServingModels,
response_role: str,
*,
openai_serving_render: "OpenAIServingRender",
request_logger: RequestLogger | None,
chat_template: str | None,
chat_template_content_format: ChatTemplateContentFormatOption,
@@ -114,6 +112,7 @@ class OpenAIServingChat(OpenAIServing):
return_tokens_as_token_ids=return_tokens_as_token_ids,
)
self.openai_serving_render = openai_serving_render
self.response_role = response_role
self.chat_template = chat_template
self.chat_template_content_format: Final = chat_template_content_format
@@ -186,7 +185,10 @@ class OpenAIServingChat(OpenAIServing):
request: ChatCompletionRequest,
) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
"""
render chat request by validating and preprocessing inputs.
Validate the model and preprocess a chat completion request.
Delegates preprocessing logic to OpenAIServingRender, adding the
engine-aware checks (LoRA model validation, engine health).
Returns:
A tuple of (conversation, engine_prompts) on success,
@@ -203,78 +205,7 @@ class OpenAIServingChat(OpenAIServing):
if self.engine_client.errored:
raise self.engine_client.dead_error
tokenizer = self.renderer.tokenizer
tool_parser = self.tool_parser
if is_mistral_tokenizer(tokenizer):
# because of issues with pydantic we need to potentially
# re-serialize the tool_calls field of the request
# for more info: see comment in `maybe_serialize_tool_calls`
_mt.maybe_serialize_tool_calls(request) # type: ignore[arg-type]
_mt.truncate_tool_call_ids(request) # type: ignore[arg-type]
_mt.validate_request_params(request)
# Check if tool parsing is unavailable (common condition)
tool_parsing_unavailable = (
tool_parser is None
and not is_mistral_tokenizer(tokenizer)
and not self.use_harmony
)
# Validate tool_choice when tool parsing is required but unavailable
if tool_parsing_unavailable and request.tool_choice not in (
None,
"none",
):
if request.tool_choice == "auto" and not self.enable_auto_tools:
# for hf tokenizers, "auto" tools requires
# --enable-auto-tool-choice and --tool-call-parser
return self.create_error_response(
'"auto" tool choice requires '
"--enable-auto-tool-choice and --tool-call-parser to be set"
)
elif request.tool_choice != "auto":
# "required" or named tool requires tool parser
return self.create_error_response(
f'tool_choice="{request.tool_choice}" requires '
"--tool-call-parser to be set"
)
if request.tools is None or (
request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
):
tool_dicts = None
else:
tool_dicts = [tool.model_dump() for tool in request.tools]
if not self.use_harmony:
# Common case.
error_check_ret = self._validate_chat_template(
request_chat_template=request.chat_template,
chat_template_kwargs=request.chat_template_kwargs,
trust_request_chat_template=self.trust_request_chat_template,
)
if error_check_ret is not None:
return error_check_ret
conversation, engine_prompts = await self._preprocess_chat(
request,
request.messages,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=self.default_chat_template_kwargs,
tool_dicts=tool_dicts,
tool_parser=tool_parser,
)
else:
# For GPT-OSS.
should_include_tools = tool_dicts is not None
conversation, engine_prompts = self._make_request_with_harmony(
request, should_include_tools
)
return conversation, engine_prompts
return await self.openai_serving_render.render_chat(request)
async def create_chat_completion(
self,
@@ -1875,50 +1806,3 @@ class OpenAIServingChat(OpenAIServing):
)
]
)
def _make_request_with_harmony(
    self,
    request: ChatCompletionRequest,
    should_include_tools: bool = True,
):
    """Build Harmony (GPT-OSS) messages and the engine prompt for a request.

    The message list is assembled in this order: system message, optional
    developer message (only when the request carries tools), then the
    messages parsed from ``request.messages``. The list is rendered to
    prompt token ids and wrapped in a ``TokensPrompt``.

    Args:
        request: The chat completion request to convert.
        should_include_tools: Forwarded as ``with_custom_tools`` to the
            system message; when False the developer message is built with
            ``tools=None``.

    Returns:
        A tuple ``(messages, [engine_prompt])``.

    Raises:
        ValueError: If ``request.reasoning_effort`` is ``"none"``.
    """
    # because of issues with pydantic we need to potentially
    # re-serialize the tool_calls field of the request
    # for more info: see comment in `maybe_serialize_tool_calls`
    _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]

    # NOTE: In Chat Completion API, browsing is enabled by default
    # if the model supports it. TODO: Support browsing.
    assert not self.supports_browsing
    assert not self.supports_code_interpreter

    reasoning_effort = request.reasoning_effort
    if reasoning_effort == "none":
        raise ValueError(f"Harmony does not support {reasoning_effort=}")

    # System message always comes first.
    messages: list[OpenAIMessage] = [
        get_system_message(
            reasoning_effort=reasoning_effort,
            browser_description=None,
            python_description=None,
            with_custom_tools=should_include_tools,
        )
    ]

    # Developer message is emitted whenever the request carries tools; the
    # tool list itself is dropped when tools should not be included.
    if request.tools:
        dev_tools = request.tools if should_include_tools else None
        messages.append(get_developer_message(tools=dev_tools))  # type: ignore[arg-type]

    # Conversation messages supplied by the caller.
    messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))

    # Render prompt token ids and wrap them for the engine.
    engine_prompt = TokensPrompt(prompt_token_ids=render_for_completion(messages))

    # Propagate cache_salt only when the request provides one.
    if request.cache_salt is not None:
        engine_prompt["cache_salt"] = request.cache_salt

    return messages, [engine_prompt]

View File

@@ -5,7 +5,7 @@ import asyncio
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import cast
from typing import TYPE_CHECKING, cast
from fastapi import Request
@@ -42,6 +42,9 @@ from vllm.tokenizers import TokenizerLike
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
if TYPE_CHECKING:
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
logger = init_logger(__name__)
@@ -51,6 +54,7 @@ class OpenAIServingCompletion(OpenAIServing):
engine_client: EngineClient,
models: OpenAIServingModels,
*,
openai_serving_render: "OpenAIServingRender",
request_logger: RequestLogger | None,
return_tokens_as_token_ids: bool = False,
enable_prompt_tokens_details: bool = False,
@@ -63,6 +67,7 @@ class OpenAIServingCompletion(OpenAIServing):
return_tokens_as_token_ids=return_tokens_as_token_ids,
)
self.openai_serving_render = openai_serving_render
self.enable_prompt_tokens_details = enable_prompt_tokens_details
self.enable_force_include_usage = enable_force_include_usage
@@ -79,7 +84,10 @@ class OpenAIServingCompletion(OpenAIServing):
request: CompletionRequest,
) -> list[ProcessorInputs] | ErrorResponse:
"""
render completion request by validating and preprocessing inputs.
Validate the model and preprocess a completion request.
Delegates preprocessing logic to OpenAIServingRender, adding the
engine-aware checks (LoRA model validation, engine health).
Returns:
A list of engine_prompts on success,
@@ -95,25 +103,7 @@ class OpenAIServingCompletion(OpenAIServing):
if self.engine_client.errored:
raise self.engine_client.dead_error
# Return error for unsupported features.
if request.suffix is not None:
return self.create_error_response("suffix is not currently supported")
if request.echo and request.prompt_embeds is not None:
return self.create_error_response("Echo is unsupported with prompt embeds.")
if request.prompt_logprobs is not None and request.prompt_embeds is not None:
return self.create_error_response(
"prompt_logprobs is not compatible with prompt embeds."
)
engine_prompts = await self._preprocess_completion(
request,
prompt_input=request.prompt,
prompt_embeds=request.prompt_embeds,
)
return engine_prompts
return await self.openai_serving_render.render_completion(request)
async def create_completion(
self,

View File

@@ -72,6 +72,29 @@ async def init_generate_state(
tool_server = None
resolved_chat_template = load_chat_template(args.chat_template)
# Render endpoints are always backed by OpenAIServingRender so that
# /v1/chat/completions/render and /v1/completions/render work on both
# generate-mode and render-only servers.
# It is created first so that OpenAIServingChat and OpenAIServingCompletion
# can delegate their preprocessing logic to it.
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
state.openai_serving_render = OpenAIServingRender(
model_config=engine_client.model_config,
renderer=engine_client.renderer,
io_processor=engine_client.io_processor,
model_registry=state.openai_serving_models.registry,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
trust_request_chat_template=args.trust_request_chat_template,
enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack,
)
state.openai_serving_responses = (
OpenAIServingResponses(
engine_client,
@@ -96,6 +119,7 @@ async def init_generate_state(
engine_client,
state.openai_serving_models,
args.response_role,
openai_serving_render=state.openai_serving_render,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
@@ -120,6 +144,7 @@ async def init_generate_state(
OpenAIServingCompletion(
engine_client,
state.openai_serving_models,
openai_serving_render=state.openai_serving_render,
request_logger=request_logger,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
@@ -133,6 +158,7 @@ async def init_generate_state(
engine_client,
state.openai_serving_models,
args.response_role,
openai_serving_render=state.openai_serving_render,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
@@ -159,24 +185,3 @@ async def init_generate_state(
if "generate" in supported_tasks
else None
)
# Render endpoints are always backed by OpenAIServingRender so that
# /v1/chat/completions/render and /v1/completions/render work on both
# generate-mode and render-only servers.
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
state.openai_serving_render = OpenAIServingRender(
model_config=engine_client.model_config,
renderer=engine_client.renderer,
io_processor=engine_client.io_processor,
model_registry=state.openai_serving_models.registry,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
trust_request_chat_template=args.trust_request_chat_template,
enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack,
)

View File

@@ -87,15 +87,26 @@ class OpenAIServingRender:
self,
request: ChatCompletionRequest,
) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
"""Copied from OpenAIServingChat.render_chat_request.
"""Validate the model and preprocess a chat completion request.
Differences: engine_client.errored check removed (no engine client).
This is the authoritative implementation used directly by the
GPU-less render server and delegated to by OpenAIServingChat.
"""
error_check_ret = await self._check_model(request)
if error_check_ret is not None:
logger.error("Error with model %s", error_check_ret)
return error_check_ret
return await self.render_chat(request)
async def render_chat(
self,
request: ChatCompletionRequest,
) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
"""Core preprocessing logic for chat requests (no model/engine check).
Called directly by render_chat_request and delegated to by
OpenAIServingChat.render_chat_request after its engine-aware checks.
"""
tokenizer = self.renderer.tokenizer
tool_parser = self.tool_parser
@@ -173,14 +184,25 @@ class OpenAIServingRender:
self,
request: CompletionRequest,
) -> list[ProcessorInputs] | ErrorResponse:
"""Copied from OpenAIServingCompletion.render_completion_request.
"""Validate the model and preprocess a completion request.
Differences: engine_client.errored check removed (no engine client).
This is the authoritative implementation used directly by the
GPU-less render server and delegated to by OpenAIServingCompletion.
"""
error_check_ret = await self._check_model(request)
if error_check_ret is not None:
return error_check_ret
return await self.render_completion(request)
async def render_completion(
self,
request: CompletionRequest,
) -> list[ProcessorInputs] | ErrorResponse:
"""Core preprocessing logic for completion requests (no model/engine check).
Called directly by render_completion_request and delegated to by
OpenAIServingCompletion.render_completion_request after its engine-aware checks.
"""
# Return error for unsupported features.
if request.suffix is not None:
return self.create_error_response("suffix is not currently supported")
@@ -206,7 +228,7 @@ class OpenAIServingRender:
request: ChatCompletionRequest,
should_include_tools: bool = True,
):
"""Copied from OpenAIServingChat._make_request_with_harmony."""
"""Build Harmony (GPT-OSS) messages and engine prompt from a chat request."""
messages: list[OpenAIMessage] = []
# because of issues with pydantic we need to potentially
@@ -219,11 +241,10 @@ class OpenAIServingRender:
# if the model supports it. TODO: Support browsing.
assert not self.supports_browsing
assert not self.supports_code_interpreter
assert request.reasoning_effort != "none", (
"Harmony does not support reasoning_effort='none'"
)
if (reasoning_effort := request.reasoning_effort) == "none":
raise ValueError(f"Harmony does not support {reasoning_effort=}")
sys_msg = get_system_message(
reasoning_effort=request.reasoning_effort,
reasoning_effort=reasoning_effort,
browser_description=None,
python_description=None,
with_custom_tools=should_include_tools,