[Refactor] Introduce basic Renderer for completion-style request (#24010)

Signed-off-by: sfeng33 <4florafeng@gmail.com>
Author: Flora Feng
Date: 2025-09-03 22:21:12 -07:00
Committed by: GitHub
Parent: e919d6f549
Commit: 712b273f65
5 changed files with 416 additions and 27 deletions


@@ -4,7 +4,7 @@
 import asyncio
 import base64
 import time
-from collections.abc import AsyncGenerator, Sequence
+from collections.abc import AsyncGenerator
 from typing import Final, Literal, Optional, Union, cast

 import jinja2
@@ -26,7 +26,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               PoolingRequest, PoolingResponse,
                                               PoolingResponseData, UsageInfo)
 # yapf: enable
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, RequestPrompt
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.logger import init_logger
@@ -104,6 +104,7 @@ class OpenAIServingPooling(OpenAIServing):
         else:
             tokenizer = await self.engine_client.get_tokenizer(lora_request
                                                                )
+        renderer = self._get_renderer(tokenizer)

         if getattr(request, "dimensions", None) is not None:
             return self.create_error_response(
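
The new `_get_renderer(tokenizer)` call gives each request a renderer bound to the tokenizer resolved just above, which may be LoRA-specific. A minimal sketch of what such a helper could look like follows; the `CompletionRenderer` body and the `ServingBase` stand-in are assumptions inferred from this diff's call sites, not the PR's actual implementation (a fuller renderer sketch appears after the render_prompt hunk below):

    class CompletionRenderer:
        """Illustrative stand-in; see the fuller sketch below."""

        def __init__(self, tokenizer) -> None:
            self.tokenizer = tokenizer


    class ServingBase:
        """Hypothetical stand-in for OpenAIServing."""

        def _get_renderer(self, tokenizer) -> CompletionRenderer:
            # Built per request rather than once at start-up because
            # the tokenizer can differ per LoRA adapter.
            return CompletionRenderer(tokenizer)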
@@ -126,14 +127,11 @@ class OpenAIServingPooling(OpenAIServing):
             engine_prompts = await self.io_processor.pre_process_async(
                 prompt=validated_prompt, request_id=request_id)
-            request_prompts: Sequence[RequestPrompt] = [
-                ""
-            ] * len(engine_prompts)
         elif isinstance(request, PoolingChatRequest):
             (
                 _,
-                request_prompts,
+                _,
                 engine_prompts,
             ) = await self._preprocess_chat(
                 request,
@@ -149,13 +147,13 @@ class OpenAIServingPooling(OpenAIServing):
                 add_special_tokens=request.add_special_tokens,
             )
         elif isinstance(request, PoolingCompletionRequest):
-            (request_prompts,
-             engine_prompts) = await self._preprocess_completion(
-                 request,
-                 tokenizer,
-                 request.input,
-                 add_special_tokens=request.add_special_tokens,
-             )
+            engine_prompts = await renderer.render_prompt(
+                prompt_or_prompts=request.input,
+                max_length=self.max_model_len,
+                truncate_prompt_tokens=truncate_prompt_tokens,
+                add_special_tokens=request.add_special_tokens,
+                cache_salt=getattr(request, 'cache_salt', None),
+            )
         else:
             raise ValueError(
                 f"Unsupported request of type {type(request)}")
@@ -177,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing):
                 request_id_item = f"{request_id}-{i}"

                 self._log_inputs(request_id_item,
-                                 request_prompts[i],
+                                 engine_prompt,
                                  params=pooling_params,
                                  lora_request=lora_request)
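
With `request_prompts` gone, the logging hunk passes each `engine_prompt` directly; the rendered prompts line up index-for-index with the inputs. End to end, reusing the `CompletionRenderer` sketched above with a toy tokenizer (toy names only, not vLLM APIs):

    import asyncio


    class ToyTokenizer:
        def encode(self, text, add_special_tokens=True):
            # Map characters to code points; purely illustrative.
            return [ord(c) for c in text]


    async def demo() -> None:
        renderer = CompletionRenderer(ToyTokenizer())
        engine_prompts = await renderer.render_prompt(
            prompt_or_prompts=["hello", "world"],
            max_length=2048,
            add_special_tokens=True,
        )
        for prompt in engine_prompts:
            print(prompt.prompt_token_ids)


    asyncio.run(demo())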