[Frontend] Added support for HF's new continue_final_message parameter (#8942)

2024-09-29 20:59:47 +03:00
parent 1fb9c1b0bf
commit 6c9ba48fde
7 changed files with 102 additions and 28 deletions
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -211,6 +211,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
         "This is a parameter used by chat template in tokenizer config of the "
         "model."),
    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=
+        ("If this is set, the chat will be formatted so that the final "
+         "message in the chat is open-ended, without any EOS tokens. The "
+         "model will continue this message rather than starting a new one. "
+         "This allows you to \"prefill\" part of the model's response for it. "
+         "Cannot be used at the same time as `add_generation_prompt`."),
+    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
@@ -431,6 +440,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
                        " of the specified `tools`")
        return data

+    @model_validator(mode="before")  # "before" mode: receives the incoming request data
+    @classmethod
+    def check_generation_prompt(cls, data):  # assumes `data` supports .get() - TODO confirm
+        if data.get("continue_final_message") and data.get(  # an absent key reads as None (falsy)
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "  # both flags truthy: reject
+                             "`add_generation_prompt` to True.")
+        return data  # handed back unmodified when the check passes
+

 class CompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
@@ -862,8 +880,18 @@ class TokenizeChatRequest(OpenAIBaseModel):
    messages: List[ChatCompletionMessageParam]

    add_generation_prompt: bool = Field(default=True)
+    continue_final_message: bool = Field(default=False)
    add_special_tokens: bool = Field(default=False)

+    @model_validator(mode="before")  # "before" mode: receives the incoming request data
+    @classmethod
+    def check_generation_prompt(cls, data):  # assumes `data` supports .get() - TODO confirm
+        if data.get("continue_final_message") and data.get(  # an absent key reads as None (falsy)
+                "add_generation_prompt"):  # NOTE(review): field default is True; an omitted key may slip past - confirm
+            raise ValueError("Cannot set both `continue_final_message` and "  # both flags truthy: reject
+                             "`add_generation_prompt` to True.")
+        return data  # handed back unmodified when the check passes
+

 TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]

--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -140,6 +140,7 @@ class OpenAIServingChat(OpenAIServing):
                    messages=request.messages,
                    chat_template=request.chat_template or self.chat_template,
                    add_generation_prompt=request.add_generation_prompt,
+                    continue_final_message=request.continue_final_message,
                    tools=tool_dicts,
                    documents=request.documents,
                    **(request.chat_template_kwargs or {}),
@@ -150,6 +151,7 @@ class OpenAIServingChat(OpenAIServing):
                    conversation=conversation,
                    chat_template=request.chat_template or self.chat_template,
                    add_generation_prompt=request.add_generation_prompt,
+                    continue_final_message=request.continue_final_message,
                    tools=tool_dicts,
                    documents=request.documents,
                    **(request.chat_template_kwargs or {}),
@@ -361,7 +363,7 @@ class OpenAIServingChat(OpenAIServing):

                    # Send response to echo the input portion of the
                    # last message
-                    if request.echo:
+                    if request.echo or request.continue_final_message:
                        last_msg_content: str = ""
                        if conversation and "content" in conversation[
                                -1] and conversation[-1].get("role") == role:
@@ -716,7 +718,7 @@ class OpenAIServingChat(OpenAIServing):
                stop_reason=output.stop_reason)
            choices.append(choice_data)

-        if request.echo:
+        if request.echo or request.continue_final_message:
            last_msg_content = ""
            if conversation and "content" in conversation[-1] and conversation[
                    -1].get("role") == role:
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -87,6 +87,7 @@ class OpenAIServingTokenization(OpenAIServing):
                    messages=request.messages,
                    chat_template=self.chat_template,
                    add_generation_prompt=request.add_generation_prompt,
+                    continue_final_message=request.continue_final_message,
                )
            else:
                prompt = apply_hf_chat_template(
@@ -94,6 +95,7 @@ class OpenAIServingTokenization(OpenAIServing):
                    conversation=conversation,
                    chat_template=self.chat_template,
                    add_generation_prompt=request.add_generation_prompt,
+                    continue_final_message=request.continue_final_message,
                )
        else:
            prompt = request.prompt