[Frontend] add continue_final_message parameter to /embeddings endpoint (#31497)

Signed-off-by: Kevin P-W <140451262+kevin-pw@users.noreply.github.com>
2025-12-29 23:21:13 -08:00
parent 3d973764ce
commit 51085c2aeb
2 changed files with 11 additions and 2 deletions
--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -97,7 +97,16 @@ class EmbeddingChatRequest(OpenAIBaseModel):
            "model."
        ),
    )
-
+    continue_final_message: bool = Field(
+        default=False,
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
+    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
--- a/vllm/entrypoints/pooling/embed/serving.py
+++ b/vllm/entrypoints/pooling/embed/serving.py
@@ -89,7 +89,7 @@ class EmbeddingMixin(OpenAIServing):
                    chat_template=ctx.request.chat_template or ctx.chat_template,
                    chat_template_content_format=ctx.chat_template_content_format,
                    add_generation_prompt=ctx.request.add_generation_prompt,
-                    continue_final_message=False,
+                    continue_final_message=ctx.request.continue_final_message,
                    add_special_tokens=ctx.request.add_special_tokens,
                )
            else: