Allow AsyncLLMEngine.generate to target a specific DP rank (#19102)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
This commit is contained in:
@@ -212,6 +212,7 @@ class Processor:
|
||||
trace_headers: Optional[Mapping[str, str]] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
priority: int = 0,
|
||||
data_parallel_rank: Optional[int] = None,
|
||||
) -> tuple[Optional[str], EngineCoreRequest]:
|
||||
|
||||
# TODO(woosuk): Support pooling models.
|
||||
@@ -328,6 +329,7 @@ class Processor:
|
||||
arrival_time=arrival_time,
|
||||
lora_request=lora_request,
|
||||
cache_salt=decoder_inputs.get("cache_salt"),
|
||||
data_parallel_rank=data_parallel_rank,
|
||||
)
|
||||
|
||||
def _validate_model_inputs(self,
|
||||
|
||||
Reference in New Issue
Block a user