[Frontend] Delegate preprocessing to OpenAIServingRender (#36483)

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
This commit is contained in:
Sage
2026-03-13 09:39:43 +02:00
committed by GitHub
parent a4ad9db541
commit a2268617cf
10 changed files with 203 additions and 196 deletions

View File

@@ -21,8 +21,13 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.models.serving import (
BaseModelPath,
OpenAIModelRegistry,
OpenAIServingModels,
)
from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.exceptions import VLLMValidationError
from vllm.inputs import TokensPrompt
from vllm.outputs import CompletionOutput, RequestOutput
@@ -557,15 +562,32 @@ def _build_renderer(model_config: MockModelConfig):
)
def _build_serving_render(
    engine, model_registry: OpenAIModelRegistry
) -> OpenAIServingRender:
    """Construct an OpenAIServingRender wired to the given engine.

    Test helper: pulls model_config, renderer and io_processor off the
    (possibly mocked) engine, uses the module-level CHAT_TEMPLATE, and
    disables request logging. The returned instance is passed to
    OpenAIServingChat as ``openai_serving_render`` so chat preprocessing
    is delegated to it.
    """
    return OpenAIServingRender(
        model_config=engine.model_config,
        renderer=engine.renderer,
        io_processor=engine.io_processor,
        model_registry=model_registry,
        # No request logging in tests.
        request_logger=None,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
    )
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels(
engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
)
openai_serving_render = _build_serving_render(engine, models.registry)
serving_chat = OpenAIServingChat(
engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,
@@ -586,10 +608,13 @@ async def _async_serving_chat_init():
engine = MockEngine()
models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
openai_serving_render = _build_serving_render(engine, models.registry)
serving_completion = OpenAIServingChat(
engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,
@@ -1182,7 +1207,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1209,7 +1236,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1230,7 +1259,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1274,7 +1305,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1311,7 +1344,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1355,7 +1390,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1392,7 +1429,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
[
@@ -1436,7 +1475,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages(
input_messages_2,
[
@@ -1486,7 +1527,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
input_messages_3, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_3)
)
verify_harmony_messages(
input_messages_3,
[
@@ -1549,7 +1592,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
input_messages_4, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_4)
)
verify_harmony_messages(
input_messages_4,
[
@@ -1598,7 +1643,9 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
@@ -1629,7 +1676,9 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
@@ -1658,7 +1707,9 @@ class TestServingChatWithHarmony:
},
]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req)
input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages(
input_messages,
@@ -1689,11 +1740,14 @@ async def test_tool_choice_validation_without_parser():
engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
)
openai_serving_render = _build_serving_render(mock_engine, models.registry)
# Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat = OpenAIServingChat(
mock_engine,
models,
response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,