[Core] Async_output_proc: Add virtual engine support (towards pipeline parallel) (#7911)

This commit is contained in:
Alexander Matveev
2024-08-28 03:02:30 -04:00
committed by GitHub
parent 51f86bf487
commit f508e03e7f
6 changed files with 123 additions and 68 deletions

View File

@@ -811,6 +811,9 @@ class SequenceGroup:
self.is_single_seq = len(self.seqs) == 1
def is_finished(self) -> bool:
    """Return True once every sequence in this group has finished.

    The common single-sequence case is checked directly to skip the
    generator machinery of ``all()``.
    """
    if not self.is_single_seq:
        return all(s.is_finished() for s in self.seqs)
    return self.seqs[0].is_finished()
def is_prefill(self) -> bool:
@@ -1290,8 +1293,8 @@ class ExecuteModelRequest(
finished_requests_ids: List[str] = msgspec.field(default_factory=list)
# The last sampled token ids for multi step decoding.
last_sampled_token_ids: Optional[torch.Tensor] = None
# Async postprocessor
output_proc_callback_fn: Optional[Callable] = None
# Async callback
async_callback: Optional[Callable] = None
@property
def is_first_multi_step(self) -> bool:
@@ -1338,4 +1341,4 @@ class ExecuteModelRequest(
finished_requests_ids=self.finished_requests_ids,
last_sampled_token_ids=self.last_sampled_token_ids.clone()
if self.last_sampled_token_ids is not None else None,
output_proc_callback_fn=self.output_proc_callback_fn)
async_callback=self.async_callback)