[Refactor] Introduce basic Renderer for completion-style request (#24010)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
This commit is contained in:
@@ -4,7 +4,7 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import time
|
||||
from collections.abc import AsyncGenerator, Sequence
|
||||
from collections.abc import AsyncGenerator
|
||||
from typing import Final, Literal, Optional, Union, cast
|
||||
|
||||
import jinja2
|
||||
@@ -26,7 +26,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
|
||||
PoolingRequest, PoolingResponse,
|
||||
PoolingResponseData, UsageInfo)
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, RequestPrompt
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.utils import _validate_truncation_size
|
||||
from vllm.logger import init_logger
|
||||
@@ -104,6 +104,7 @@ class OpenAIServingPooling(OpenAIServing):
|
||||
else:
|
||||
tokenizer = await self.engine_client.get_tokenizer(lora_request
|
||||
)
|
||||
renderer = self._get_renderer(tokenizer)
|
||||
|
||||
if getattr(request, "dimensions", None) is not None:
|
||||
return self.create_error_response(
|
||||
@@ -126,14 +127,11 @@ class OpenAIServingPooling(OpenAIServing):
|
||||
|
||||
engine_prompts = await self.io_processor.pre_process_async(
|
||||
prompt=validated_prompt, request_id=request_id)
|
||||
request_prompts: Sequence[RequestPrompt] = [
|
||||
""
|
||||
] * len(engine_prompts)
|
||||
|
||||
elif isinstance(request, PoolingChatRequest):
|
||||
(
|
||||
_,
|
||||
request_prompts,
|
||||
_,
|
||||
engine_prompts,
|
||||
) = await self._preprocess_chat(
|
||||
request,
|
||||
@@ -149,13 +147,13 @@ class OpenAIServingPooling(OpenAIServing):
|
||||
add_special_tokens=request.add_special_tokens,
|
||||
)
|
||||
elif isinstance(request, PoolingCompletionRequest):
|
||||
(request_prompts,
|
||||
engine_prompts) = await self._preprocess_completion(
|
||||
request,
|
||||
tokenizer,
|
||||
request.input,
|
||||
add_special_tokens=request.add_special_tokens,
|
||||
)
|
||||
engine_prompts = await renderer.render_prompt(
|
||||
prompt_or_prompts=request.input,
|
||||
max_length=self.max_model_len,
|
||||
truncate_prompt_tokens=truncate_prompt_tokens,
|
||||
add_special_tokens=request.add_special_tokens,
|
||||
cache_salt=getattr(request, 'cache_salt', None),
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported request of type {type(request)}")
|
||||
@@ -177,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing):
|
||||
request_id_item = f"{request_id}-{i}"
|
||||
|
||||
self._log_inputs(request_id_item,
|
||||
request_prompts[i],
|
||||
engine_prompt,
|
||||
params=pooling_params,
|
||||
lora_request=lora_request)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user