[Renderer] Move InputPreprocessor into Renderer (2/2) (#34560)

commit 574fe75245
parent c61a98f529
Author:    Cyrus Leung
Date:      2026-02-17 21:29:01 +08:00
Committer: GitHub

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

32 changed files with 984 additions and 1054 deletions


@@ -67,13 +67,12 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
 )
 from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
 from vllm.entrypoints.utils import get_max_tokens, should_include_usage
-from vllm.inputs.data import TokensPrompt
+from vllm.inputs.data import ProcessorInputs, TokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.parser import ParserManager
 from vllm.reasoning import ReasoningParser
-from vllm.renderers.inputs import TokPrompt
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tokenizers import TokenizerLike
 from vllm.tokenizers.mistral import (
@@ -221,7 +220,7 @@ class OpenAIServingChat(OpenAIServing):
     async def render_chat_request(
         self,
         request: ChatCompletionRequest,
-    ) -> tuple[list[ConversationMessage], list[TokPrompt]] | ErrorResponse:
+    ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
         """
         render chat request by validating and preprocessing inputs.
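
With this signature change, callers of render_chat_request receive engine-ready ProcessorInputs instead of tokenizer-level TokPrompt objects, so no separate InputPreprocessor pass is needed before handing prompts to the engine. A minimal sketch of how a caller might consume the new return type; render_chat_request and ErrorResponse come from this diff, while respond() and its arguments are hypothetical stand-ins for the surrounding serving machinery:

    async def respond(serving, request, sampling_params):
        # render_chat_request validates the request and returns either an
        # ErrorResponse or (conversation, engine-ready prompts).
        rendered = await serving.render_chat_request(request)
        if isinstance(rendered, ErrorResponse):
            return rendered
        conversation, engine_prompts = rendered
        # Each element of engine_prompts is a ProcessorInputs instance that
        # can go straight to engine_client.generate().
        for engine_prompt in engine_prompts:
            ...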
@@ -380,7 +379,9 @@
         generators: list[AsyncGenerator[RequestOutput, None]] = []
         try:
             for i, engine_prompt in enumerate(engine_prompts):
-                prompt_text = self._extract_prompt_text(engine_prompt)
+                prompt_token_ids = self._extract_prompt_components(
+                    engine_prompt
+                ).token_ids
 
                 # If we are creating sub requests for multiple prompts, ensure that they
                 # have unique request ids.
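
The switch from _extract_prompt_text to _extract_prompt_components(...).token_ids reflects that a processed prompt always carries its token IDs, while prompt text may be absent for token-only inputs. A hypothetical illustration of that shape (not the PR's implementation; the dataclass and helper below are assumptions for the example, though the "prompt" and "prompt_token_ids" keys mirror vLLM's TextPrompt/TokensPrompt dicts):

    from dataclasses import dataclass

    @dataclass
    class PromptComponents:
        # Assumed shape: text may be None for prompts submitted as raw
        # token IDs, so downstream code keys off token_ids instead.
        text: str | None
        token_ids: list[int] | None

    def extract_components(engine_prompt: dict) -> PromptComponents:
        # Pull both components from a dict-style prompt, tolerating
        # whichever field is missing.
        return PromptComponents(
            text=engine_prompt.get("prompt"),
            token_ids=engine_prompt.get("prompt_token_ids"),
        )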
@@ -431,35 +432,21 @@
                         trace_headers=trace_headers,
                     )
                 else:
                     tok_params = request.build_tok_params(self.model_config)
                     tokenization_kwargs = tok_params.get_encode_kwargs()
-                    engine_request = self.input_processor.process_inputs(
-                        sub_request_id,
-                        engine_prompt,
-                        sampling_params,
-                        lora_request=lora_request,
-                        tokenization_kwargs=tokenization_kwargs,
-                        trace_headers=trace_headers,
-                        priority=request.priority,
-                        data_parallel_rank=data_parallel_rank,
-                    )
-                    reasoning_ended = None
-                    if reasoning_parser:
-                        reasoning_ended = reasoning_parser.is_reasoning_end(
-                            engine_request.prompt_token_ids or []  # type: ignore[attr-defined]
-                        )
-                    engine_request.reasoning_ended = reasoning_ended
-                    generator = self.engine_client.generate(
-                        engine_request,
-                        sampling_params,
-                        sub_request_id,
-                        lora_request=lora_request,
-                        trace_headers=trace_headers,
-                        priority=request.priority,
-                        prompt_text=prompt_text,
-                        tokenization_kwargs=tokenization_kwargs,
-                        data_parallel_rank=data_parallel_rank,
-                    )
+                    reasoning_ended = (
+                        reasoning_parser.is_reasoning_end(prompt_token_ids or [])
+                        if reasoning_parser
+                        else None
+                    )
+                    generator = self.engine_client.generate(
+                        engine_prompt,
+                        sampling_params,
+                        sub_request_id,
+                        lora_request=lora_request,
+                        tokenization_kwargs=tokenization_kwargs,
+                        trace_headers=trace_headers,
+                        priority=request.priority,
+                        data_parallel_rank=data_parallel_rank,
+                        reasoning_ended=reasoning_ended,
+                    )
                 generators.append(generator)
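
Taken together, this hunk collapses the old two-step submission (process_inputs to build an engine_request, then generate on that request) into a single generate call on the rendered prompt, with reasoning_ended passed as a keyword argument instead of being set as an attribute. A sketch of driving the new path end to end; the positional arguments and the reasoning_ended kwarg mirror this diff rather than a documented public API, and run_one() itself is a hypothetical wrapper:

    async def run_one(engine_client, engine_prompt, sampling_params, sub_request_id):
        # One generate() call per rendered prompt; preprocessing already
        # happened inside the renderer, so no InputPreprocessor step here.
        generator = engine_client.generate(
            engine_prompt,
            sampling_params,
            sub_request_id,
            reasoning_ended=None,
        )
        final = None
        async for output in generator:
            # The stream yields incremental RequestOutput objects; keep the
            # last one as the completed result.
            final = output
        return final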