[P/D] NIXL Integration (#17751)

Signed-off-by: ApostaC <yihua98@uchicago.edu>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Brent Salisbury <bsalisbu@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: ApostaC <yihua98@uchicago.edu>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Brent Salisbury <bsalisbu@redhat.com>
This commit is contained in:
Robert Shaw
2025-05-12 12:46:16 -04:00
committed by GitHub
parent 05a4324f8e
commit d19110204c
34 changed files with 2723 additions and 108 deletions

View File

@@ -403,6 +403,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."))
kv_transfer_params: Optional[dict[str, Any]] = Field(
default=None,
description="KVTransfer parameters used for disaggregated serving.")
# doc: end-chat-completion-extra-params
@@ -540,7 +543,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
output_kind=RequestOutputKind.DELTA if self.stream \
else RequestOutputKind.FINAL_ONLY,
guided_decoding=guided_decoding,
logit_bias=self.logit_bias)
logit_bias=self.logit_bias,
extra_args=({"kv_transfer_params": self.kv_transfer_params}
if self.kv_transfer_params else None))
def _get_guided_json_from_tool(
self) -> Optional[Union[str, dict, BaseModel]]:
@@ -848,6 +853,10 @@ class CompletionRequest(OpenAIBaseModel):
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified."))
kv_transfer_params: Optional[dict[str, Any]] = Field(
default=None,
description="KVTransfer parameters used for disaggregated serving.")
# doc: end-completion-extra-params
# Default sampling parameters for completion requests
@@ -973,7 +982,9 @@ class CompletionRequest(OpenAIBaseModel):
else RequestOutputKind.FINAL_ONLY,
guided_decoding=guided_decoding,
logit_bias=self.logit_bias,
allowed_token_ids=self.allowed_token_ids)
allowed_token_ids=self.allowed_token_ids,
extra_args=({"kv_transfer_params": self.kv_transfer_params}
if self.kv_transfer_params else None))
@model_validator(mode="before")
@classmethod
@@ -1223,6 +1234,8 @@ class CompletionResponse(OpenAIBaseModel):
model: str
choices: list[CompletionResponseChoice]
usage: UsageInfo
kv_transfer_params: Optional[dict[str, Any]] = Field(
default=None, description="KVTransfer parameters.")
class CompletionResponseStreamChoice(OpenAIBaseModel):
@@ -1412,6 +1425,8 @@ class ChatCompletionResponse(OpenAIBaseModel):
choices: list[ChatCompletionResponseChoice]
usage: UsageInfo
prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
kv_transfer_params: Optional[dict[str, Any]] = Field(
default=None, description="KVTransfer parameters.")
class DeltaMessage(OpenAIBaseModel):