From 51085c2aebe7df2c53dbe9a44d89bc3ee761793f Mon Sep 17 00:00:00 2001 From: Kevin <140451262+kevin-pw@users.noreply.github.com> Date: Mon, 29 Dec 2025 23:21:13 -0800 Subject: [PATCH] [Frontend] add continue_final_message parameter to /embeddings endpoint (#31497) Signed-off-by: Kevin P-W <140451262+kevin-pw@users.noreply.github.com> --- vllm/entrypoints/pooling/embed/protocol.py | 11 ++++++++++- vllm/entrypoints/pooling/embed/serving.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py index 6a8f8c443..3829a1a6a 100644 --- a/vllm/entrypoints/pooling/embed/protocol.py +++ b/vllm/entrypoints/pooling/embed/protocol.py @@ -97,7 +97,16 @@ class EmbeddingChatRequest(OpenAIBaseModel): "model." ), ) - + continue_final_message: bool = Field( + default=False, + description=( + "If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + 'This allows you to "prefill" part of the model\'s response for it. ' + "Cannot be used at the same time as `add_generation_prompt`." + ), + ) add_special_tokens: bool = Field( default=False, description=( diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py index f5a21208e..e94b80043 100644 --- a/vllm/entrypoints/pooling/embed/serving.py +++ b/vllm/entrypoints/pooling/embed/serving.py @@ -89,7 +89,7 @@ class EmbeddingMixin(OpenAIServing): chat_template=ctx.request.chat_template or ctx.chat_template, chat_template_content_format=ctx.chat_template_content_format, add_generation_prompt=ctx.request.add_generation_prompt, - continue_final_message=False, + continue_final_message=ctx.request.continue_final_message, add_special_tokens=ctx.request.add_special_tokens, ) else: