[Frontend] Delegate tokenization serving preprocessing to OpenAIServingRender (#37266)

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
Sage authored 2026-03-17 13:22:54 +02:00, committed by GitHub
parent 4af9ed21cb
commit 00f8e0d211
5 changed files with 33 additions and 30 deletions
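
In broad strokes, the commit replaces per-endpoint preprocessing with a single render service that consumers receive at construction time. A minimal sketch of that pattern, assuming simplified stand-ins rather than the real vLLM classes (only the preprocess_* method names mirror the diff):

import asyncio


class RenderService:
    """Owns prompt preprocessing, playing the role of OpenAIServingRender."""

    async def preprocess_completion(self, prompt: str) -> list[dict]:
        # Toy tokenization: one token id per character.
        return [{"prompt_token_ids": [ord(c) for c in prompt]}]


class TokenizationService:
    """Delegates to the injected render service instead of owning the logic."""

    def __init__(self, openai_serving_render: RenderService) -> None:
        self.openai_serving_render = openai_serving_render

    async def tokenize(self, prompt: str) -> list[int]:
        prompts = await self.openai_serving_render.preprocess_completion(prompt)
        return prompts[0]["prompt_token_ids"]


render = RenderService()
tokenization = TokenizationService(render)  # one shared instance, wired once
print(asyncio.run(tokenization.tokenize("hi")))  # [104, 105]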

View File

@@ -111,7 +111,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
[{"prompt_token_ids": [1, 2, 3]}],
)
serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
serving_chat.openai_serving_render.preprocess_chat = AsyncMock(
side_effect=_fake_preprocess_chat
)
return serving_chat
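
The fixture above now patches the public preprocess_chat name instead of the underscore-private one. For reference, a self-contained sketch of the same AsyncMock pattern; the Render class is a hypothetical stand-in for OpenAIServingRender:

import asyncio
from unittest.mock import AsyncMock


async def _fake_preprocess_chat(*args, **kwargs):
    # Mirrors the fixture's return shape: (conversation, engine_prompts).
    return [], [{"prompt_token_ids": [1, 2, 3]}]


class Render:  # hypothetical stand-in
    async def preprocess_chat(self, request, messages):
        raise NotImplementedError


render = Render()
render.preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)

conversation, prompts = asyncio.run(render.preprocess_chat("request", []))
assert prompts == [{"prompt_token_ids": [1, 2, 3]}]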

View File

@@ -46,6 +46,7 @@ from vllm.entrypoints.sagemaker.api_router import sagemaker_standards_bootstrap
 from vllm.entrypoints.serve.elastic_ep.middleware import (
     ScalingMiddleware,
 )
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization
 from vllm.entrypoints.utils import (
     cli_env_setup,
@@ -365,9 +366,27 @@ async def init_app_state(
         lora_modules=lora_modules,
     )
     await state.openai_serving_models.init_static_loras()
+    state.openai_serving_render = OpenAIServingRender(
+        model_config=engine_client.model_config,
+        renderer=engine_client.renderer,
+        io_processor=engine_client.io_processor,
+        model_registry=state.openai_serving_models.registry,
+        request_logger=request_logger,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+        trust_request_chat_template=args.trust_request_chat_template,
+        enable_auto_tools=args.enable_auto_tool_choice,
+        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
+        tool_parser=args.tool_call_parser,
+        default_chat_template_kwargs=args.default_chat_template_kwargs,
+        log_error_stack=args.log_error_stack,
+    )
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         state.openai_serving_models,
+        state.openai_serving_render,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
         chat_template_content_format=args.chat_template_content_format,
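
Construction order matters in this hunk: the render service must exist before OpenAIServingTokenization is built, since it is now passed as a positional argument. A hedged miniature of that wiring, with constructor arguments trimmed to the ones relevant here (class names are illustrative):

class RenderSketch:
    def __init__(self, chat_template: str | None) -> None:
        self.chat_template = chat_template


class TokenizationSketch:
    def __init__(self, render: RenderSketch, *, chat_template: str | None) -> None:
        self.openai_serving_render = render  # injected, never constructed here
        self.chat_template = chat_template


render = RenderSketch(chat_template=None)
tokenization = TokenizationSketch(render, chat_template=None)
assert tokenization.openai_serving_render is render  # one shared instance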

View File

@@ -74,26 +74,7 @@ async def init_generate_state(
     # Render endpoints are always backed by OpenAIServingRender so that
     # /v1/chat/completions/render and /v1/completions/render work on both
-    # generate-mode and render-only servers.
-    # It is created first so that OpenAIServingChat and OpenAIServingCompletion
-    # can delegate their preprocessing logic to it.
-    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
-    state.openai_serving_render = OpenAIServingRender(
-        model_config=engine_client.model_config,
-        renderer=engine_client.renderer,
-        io_processor=engine_client.io_processor,
-        model_registry=state.openai_serving_models.registry,
-        request_logger=request_logger,
-        chat_template=resolved_chat_template,
-        chat_template_content_format=args.chat_template_content_format,
-        trust_request_chat_template=args.trust_request_chat_template,
-        enable_auto_tools=args.enable_auto_tool_choice,
-        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
-        tool_parser=args.tool_call_parser,
-        default_chat_template_kwargs=args.default_chat_template_kwargs,
-        log_error_stack=args.log_error_stack,
-    )
+    # generate-mode and render-only servers. Created in init_app_state.
     state.openai_serving_responses = (
         OpenAIServingResponses(
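
The deleted block had built a second OpenAIServingRender on this code path; centralizing construction in init_app_state means every consumer now observes the same instance. A toy illustration of the failure mode duplicate construction invites (all names hypothetical):

class Render:
    def __init__(self, chat_template: str | None) -> None:
        self.chat_template = chat_template


# Two construction sites can drift when a default changes in only one place:
path_a = Render(chat_template="v2")   # updated call site
path_b = Render(chat_template=None)   # stale duplicate call site
assert path_a.chat_template != path_b.chat_template

# A single construction site shared by both paths removes that failure mode:
shared = Render(chat_template="v2")
assert shared.chat_template == "v2"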

View File

@@ -226,7 +226,7 @@ class OpenAIServingRender:
         if not self.use_harmony:
             # Common case.
-            error_check_ret = self._validate_chat_template(
+            error_check_ret = self.validate_chat_template(
                 request_chat_template=request.chat_template,
                 chat_template_kwargs=request.chat_template_kwargs,
                 trust_request_chat_template=self.trust_request_chat_template,
@@ -234,7 +234,7 @@ class OpenAIServingRender:
             if error_check_ret is not None:
                 return error_check_ret

-            conversation, engine_prompts = await self._preprocess_chat(
+            conversation, engine_prompts = await self.preprocess_chat(
                 request,
                 request.messages,
                 default_template=self.chat_template,
@@ -328,7 +328,7 @@ class OpenAIServingRender:
"prompt_logprobs is not compatible with prompt embeds."
)
engine_prompts = await self._preprocess_completion(
engine_prompts = await self.preprocess_completion(
request,
prompt_input=request.prompt,
prompt_embeds=request.prompt_embeds,
@@ -426,7 +426,7 @@ class OpenAIServingRender:
     ) -> ErrorResponse | None:
         return await self.model_registry.check_model(request.model)

-    def _validate_chat_template(
+    def validate_chat_template(
         self,
         request_chat_template: str | None,
         chat_template_kwargs: dict[str, Any] | None,
@@ -447,7 +447,7 @@ class OpenAIServingRender:
             )
         return None

-    async def _preprocess_completion(
+    async def preprocess_completion(
         self,
         request: Any,
         prompt_input: str | list[str] | list[int] | list[list[int]] | None,
@@ -490,7 +490,7 @@ class OpenAIServingRender:
             },
         )

-    async def _preprocess_chat(
+    async def preprocess_chat(
         self,
         request: Any,
         messages: list[Any],
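
Taken together, the renames in this file promote three underscore-private helpers into the class's public API, since OpenAIServingTokenization now calls them from outside. A sketch of that surface as a typing.Protocol, with signatures abbreviated from the hunks above (return types and the trust flag's type are assumptions):

from typing import Any, Protocol


class RenderLike(Protocol):
    """What a consumer such as OpenAIServingTokenization relies on."""

    def validate_chat_template(
        self,
        request_chat_template: str | None,
        chat_template_kwargs: dict[str, Any] | None,
        trust_request_chat_template: bool,
    ) -> Any | None: ...

    async def preprocess_completion(
        self,
        request: Any,
        prompt_input: str | list[str] | list[int] | list[list[int]] | None,
        prompt_embeds: Any | None,
    ) -> list[Any]: ...

    async def preprocess_chat(
        self, request: Any, messages: list[Any]
    ) -> tuple[list[Any], list[Any]]: ...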

View File

@@ -11,6 +11,7 @@ from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
 from vllm.entrypoints.serve.tokenize.protocol import (
     DetokenizeRequest,
     DetokenizeResponse,
@@ -31,6 +32,7 @@ class OpenAIServingTokenization(OpenAIServing):
         self,
         engine_client: EngineClient,
         models: OpenAIServingModels,
+        openai_serving_render: OpenAIServingRender,
         *,
         request_logger: RequestLogger | None,
         chat_template: str | None,
@@ -44,6 +46,7 @@ class OpenAIServingTokenization(OpenAIServing):
             request_logger=request_logger,
         )

+        self.openai_serving_render = openai_serving_render
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
         self.default_chat_template_kwargs = default_chat_template_kwargs or {}
@@ -68,7 +71,7 @@ class OpenAIServingTokenization(OpenAIServing):
                 if request.tools is None
                 else [tool.model_dump() for tool in request.tools]
             )
-            error_check_ret = self._validate_chat_template(
+            error_check_ret = self.openai_serving_render.validate_chat_template(
                 request_chat_template=request.chat_template,
                 chat_template_kwargs=request.chat_template_kwargs,
                 trust_request_chat_template=self.trust_request_chat_template,
@@ -76,7 +79,7 @@ class OpenAIServingTokenization(OpenAIServing):
             if error_check_ret is not None:
                 return error_check_ret

-            _, engine_prompts = await self._preprocess_chat(
+            _, engine_prompts = await self.openai_serving_render.preprocess_chat(
                 request,
                 request.messages,
                 default_template=self.chat_template,
@@ -85,7 +88,7 @@ class OpenAIServingTokenization(OpenAIServing):
                 tool_dicts=tool_dicts,
             )
         else:
-            engine_prompts = await self._preprocess_completion(
+            engine_prompts = await self.openai_serving_render.preprocess_completion(
                 request,
                 prompt_input=request.prompt,
                 prompt_embeds=None,
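
Net effect in this handler: both the chat and the completion branches now route through the shared render service rather than local helpers. A self-contained sketch of that branch structure; the request classes and return shapes are invented for illustration:

import asyncio
from dataclasses import dataclass


@dataclass
class FakeChatRequest:
    messages: list[str]


@dataclass
class FakeTextRequest:
    prompt: str


class FakeRender:
    async def preprocess_chat(self, request):
        return [], [{"prompt_token_ids": [1, 2]}]

    async def preprocess_completion(self, request):
        return [{"prompt_token_ids": [3, 4, 5]}]


class Tokenization:
    def __init__(self, openai_serving_render: FakeRender) -> None:
        self.openai_serving_render = openai_serving_render

    async def create_tokenize(self, request) -> list[int]:
        if isinstance(request, FakeChatRequest):
            _, prompts = await self.openai_serving_render.preprocess_chat(request)
        else:
            prompts = await self.openai_serving_render.preprocess_completion(request)
        return prompts[0]["prompt_token_ids"]


tok = Tokenization(FakeRender())
print(asyncio.run(tok.create_tokenize(FakeChatRequest(["hi"]))))   # [1, 2]
print(asyncio.run(tok.create_tokenize(FakeTextRequest("hello"))))  # [3, 4, 5]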