[Core][5/N] Fully working chunked prefill e2e (#3884)

This commit is contained in:
SangBin Cho
2024-04-11 09:56:48 +09:00
committed by GitHub
parent 63e7176f26
commit 67b4221a61
26 changed files with 927 additions and 315 deletions

View File

@@ -633,7 +633,10 @@ class LLMEngine:
seq_group = scheduled_seq_group.seq_group
seq_group.update_num_computed_tokens(
scheduled_seq_group.token_chunk_size)
self._process_sequence_group_outputs(seq_group, outputs)
# If there are still uncomputed tokens (> 0), the prefill is chunked.
# In that case we don't need to process outputs, so skip them.
if seq_group.get_num_uncomputed_tokens() == 0:
self._process_sequence_group_outputs(seq_group, outputs)
# Free the finished sequence groups.
self.scheduler.free_finished_seq_groups()