From 51085c2aebe7df2c53dbe9a44d89bc3ee761793f Mon Sep 17 00:00:00 2001
From: Kevin <140451262+kevin-pw@users.noreply.github.com>
Date: Mon, 29 Dec 2025 23:21:13 -0800
Subject: [PATCH] [Frontend] add continue_final_message parameter to
 /embeddings endpoint (#31497)

Signed-off-by: Kevin P-W <140451262+kevin-pw@users.noreply.github.com>
---
 vllm/entrypoints/pooling/embed/protocol.py | 11 ++++++++++-
 vllm/entrypoints/pooling/embed/serving.py  |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py
index 6a8f8c443..3829a1a6a 100644
--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -97,7 +97,16 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "model."
         ),
     )
-
+    continue_final_message: bool = Field(
+        default=False,
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
+    )
     add_special_tokens: bool = Field(
         default=False,
         description=(
diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py
index f5a21208e..e94b80043 100644
--- a/vllm/entrypoints/pooling/embed/serving.py
+++ b/vllm/entrypoints/pooling/embed/serving.py
@@ -89,7 +89,7 @@ class EmbeddingMixin(OpenAIServing):
                     chat_template=ctx.request.chat_template or ctx.chat_template,
                     chat_template_content_format=ctx.chat_template_content_format,
                     add_generation_prompt=ctx.request.add_generation_prompt,
-                    continue_final_message=False,
+                    continue_final_message=ctx.request.continue_final_message,
                     add_special_tokens=ctx.request.add_special_tokens,
                 )
             else: