[Frontend] Use new Renderer for Completions and Tokenize API (#32863)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -13,12 +13,13 @@ from openai.types.chat.chat_completion_audio import (
     ChatCompletionAudio as OpenAIChatCompletionAudio,
 )
 from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
-from pydantic import (
-    Field,
-    model_validator,
-)
+from pydantic import Field, model_validator
 
-from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.config import ModelConfig
+from vllm.entrypoints.chat_utils import (
+    ChatCompletionMessageParam,
+    ChatTemplateContentFormatOption,
+)
 from vllm.entrypoints.openai.engine.protocol import (
     AnyResponseFormat,
     DeltaMessage,
@@ -36,6 +37,7 @@ from vllm.entrypoints.openai.engine.protocol import (
 from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
+from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs
 from vllm.sampling_params import (
     BeamSearchParams,
     RequestOutputKind,
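
The import hunks above pull in the new renderer primitives (ChatParams, TokenizeParams, merge_kwargs) plus ModelConfig and ChatTemplateContentFormatOption for the builder signatures added below. A minimal sketch of the merge semantics the new code appears to rely on; merge_kwargs itself is not defined in this diff, so the override order here is an assumption:

    # Hedged sketch only: assumes merge_kwargs overlays the first mapping's
    # entries onto the second and tolerates None for either argument.
    def merge_kwargs(overrides: dict | None, defaults: dict | None) -> dict:
        merged = dict(defaults or {})
        merged.update(overrides or {})
        return merged

    assert merge_kwargs({"a": 1}, {"a": 0, "b": 2}) == {"a": 1, "b": 2}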
@@ -348,6 +350,43 @@ class ChatCompletionRequest(OpenAIBaseModel):
 
     # --8<-- [end:chat-completion-extra-params]
 
+    def build_chat_params(
+        self,
+        default_template: str | None,
+        default_template_content_format: ChatTemplateContentFormatOption,
+    ) -> ChatParams:
+        return ChatParams(
+            chat_template=self.chat_template or default_template,
+            chat_template_content_format=default_template_content_format,
+            chat_template_kwargs=merge_kwargs(
+                self.chat_template_kwargs,
+                dict(
+                    add_generation_prompt=self.add_generation_prompt,
+                    continue_final_message=self.continue_final_message,
+                    documents=self.documents,
+                    reasoning_effort=self.reasoning_effort,
+                ),
+            ),
+        )
+
+    def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
+        if self.max_completion_tokens is not None:
+            max_output_tokens: int | None = self.max_completion_tokens
+            max_output_tokens_param = "max_completion_tokens"
+        else:
+            max_output_tokens = self.max_tokens
+            max_output_tokens_param = "max_tokens"
+
+        return TokenizeParams(
+            max_total_tokens=model_config.max_model_len,
+            max_output_tokens=max_output_tokens or 0,
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            add_special_tokens=self.add_special_tokens,
+            needs_detokenization=bool(self.echo and not self.return_token_ids),
+            max_total_tokens_param="max_model_len",
+            max_output_tokens_param=max_output_tokens_param,
+        )
+
     # Default sampling parameters for chat completion requests
     _DEFAULT_SAMPLING_PARAMS: dict = {
         "repetition_penalty": 1.0,
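
The two builders move per-request plumbing out of the serving layer: build_chat_params folds the request's template-related fields into a single ChatParams, and build_tok_params resolves token budgets against the model's max_model_len. A hypothetical usage sketch; omitted request fields are assumed to have defaults, and only names visible in this hunk are exercised:

    request = ChatCompletionRequest(
        messages=[{"role": "user", "content": "Hello"}],
        max_tokens=256,
        max_completion_tokens=64,
    )

    chat_params = request.build_chat_params(
        default_template=None,
        default_template_content_format="auto",
    )

    # max_completion_tokens wins over the legacy max_tokens, and the winning
    # field name travels along for use in validation error messages.
    tok_params = request.build_tok_params(model_config)
    assert tok_params.max_output_tokens == 64
    assert tok_params.max_output_tokens_param == "max_completion_tokens"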
@@ -67,7 +67,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
 )
 from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
 from vllm.entrypoints.utils import get_max_tokens, should_include_usage
-from vllm.inputs.data import TokensPrompt
+from vllm.inputs.data import EmbedsPrompt, TokensPrompt
 from vllm.inputs.parse import get_prompt_components
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
@@ -185,8 +185,6 @@ class OpenAIServingChat(OpenAIServing):
         start_time = time.perf_counter()
 
         try:
-            renderer = self.engine_client.renderer
-
             # Create a minimal dummy request
             dummy_request = ChatCompletionRequest(
                 messages=[{"role": "user", "content": "warmup"}],
@@ -201,18 +199,10 @@ class OpenAIServingChat(OpenAIServing):
             # 3. Tokenizer initialization for chat
             await self._preprocess_chat(
                 dummy_request,
-                renderer,
                 dummy_request.messages,
-                chat_template=self.chat_template,
-                chat_template_content_format=self.chat_template_content_format,
-                add_generation_prompt=True,
-                continue_final_message=False,
-                tool_dicts=None,
-                documents=None,
-                chat_template_kwargs=None,
-                default_chat_template_kwargs=self.default_chat_template_kwargs,
-                tool_parser=None,
-                add_special_tokens=False,
+                default_template=self.chat_template,
+                default_template_content_format=self.chat_template_content_format,
+                default_template_kwargs=self.default_chat_template_kwargs,
             )
 
             elapsed = (time.perf_counter() - start_time) * 1000
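
The warmup call shrinks because the dummy request itself now carries add_generation_prompt, continue_final_message, documents, and chat_template_kwargs; call sites supply only server-level defaults. Presumably _preprocess_chat now does something like the following internally (an assumption, since its body is not part of this diff):

    # Assumed internal shape: derive ChatParams from the request, with the
    # caller-supplied values acting only as fallbacks.
    chat_params = dummy_request.build_chat_params(
        default_template=self.chat_template,
        default_template_content_format=self.chat_template_content_format,
    )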
@@ -225,7 +215,10 @@ class OpenAIServingChat(OpenAIServing):
     async def render_chat_request(
         self,
         request: ChatCompletionRequest,
-    ) -> tuple[list[ConversationMessage], list[Any]] | ErrorResponse:
+    ) -> (
+        tuple[list[ConversationMessage], list[TokensPrompt | EmbedsPrompt]]
+        | ErrorResponse
+    ):
         """
         render chat request by validating and preprocessing inputs.
 
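
The narrowed annotation documents what render_chat_request actually returns: concrete TokensPrompt or EmbedsPrompt objects instead of Any. Callers keep the usual union check, roughly:

    result = await self.render_chat_request(request)
    if isinstance(result, ErrorResponse):
        return result
    conversation, engine_prompts = result  # list[TokensPrompt | EmbedsPrompt]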
@@ -302,23 +295,14 @@ class OpenAIServingChat(OpenAIServing):
             if error_check_ret is not None:
                 return error_check_ret
 
-            chat_template_kwargs = request.chat_template_kwargs or {}
-            chat_template_kwargs.update(reasoning_effort=request.reasoning_effort)
-
             conversation, engine_prompts = await self._preprocess_chat(
                 request,
-                renderer,
                 request.messages,
-                chat_template=request.chat_template or self.chat_template,
-                chat_template_content_format=self.chat_template_content_format,
-                add_generation_prompt=request.add_generation_prompt,
-                continue_final_message=request.continue_final_message,
+                default_template=self.chat_template,
+                default_template_content_format=self.chat_template_content_format,
+                default_template_kwargs=self.default_chat_template_kwargs,
                 tool_dicts=tool_dicts,
-                documents=request.documents,
-                chat_template_kwargs=chat_template_kwargs,
-                default_chat_template_kwargs=self.default_chat_template_kwargs,
                 tool_parser=tool_parser,
-                add_special_tokens=request.add_special_tokens,
             )
         else:
             # For GPT-OSS.
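
Note the deleted reasoning_effort plumbing: the manual dict update disappears because build_chat_params folds reasoning_effort (along with documents and the generation-prompt flags) into chat_template_kwargs via merge_kwargs, and the `request.chat_template or` fallback moves inside the builder. Roughly, the shape of the change:

    # Before: each call site mutated the kwargs dict by hand.
    chat_template_kwargs = request.chat_template_kwargs or {}
    chat_template_kwargs.update(reasoning_effort=request.reasoning_effort)

    # After: one merged mapping comes out of the request itself.
    chat_params = request.build_chat_params(
        default_template=self.chat_template,
        default_template_content_format=self.chat_template_content_format,
    )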
@@ -428,11 +412,15 @@ class OpenAIServingChat(OpenAIServing):
                     trace_headers=trace_headers,
                 )
             else:
-                engine_request, tokenization_kwargs = await self._process_inputs(
+                tok_params = request.build_tok_params(self.model_config)
+                tokenization_kwargs = tok_params.get_encode_kwargs()
+
+                engine_request = self.input_processor.process_inputs(
                     sub_request_id,
                     engine_prompt,
                     sampling_params,
                     lora_request=lora_request,
+                    tokenization_kwargs=tokenization_kwargs,
                     trace_headers=trace_headers,
                     priority=request.priority,
                     data_parallel_rank=data_parallel_rank,
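
On this path the awaited _process_inputs helper gives way to an explicit two-step: build TokenizeParams from the request, then hand its encode kwargs to input_processor.process_inputs. get_encode_kwargs is only called, never defined, in this diff, so the output sketched below is an assumption based on the TokenizeParams fields visible above:

    tok_params = request.build_tok_params(self.model_config)
    tokenization_kwargs = tok_params.get_encode_kwargs()
    # Assumed to resemble:
    # {"truncate_prompt_tokens": tok_params.truncate_prompt_tokens,
    #  "add_special_tokens": tok_params.add_special_tokens}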