Optimize data movement (#20)

2023-04-02 00:30:17 -07:00
parent 1f01a18d39
commit 897cb2ae28
17 changed files with 275 additions and 135 deletions
--- a/cacheflow/worker/worker.py
+++ b/cacheflow/worker/worker.py
@@ -128,6 +128,11 @@ class Worker:
                slot = block_number * self.block_size + block_offset
                slot_mapping.append(slot)

+        cumulative_prompt_lens: List[int] = [0]
+        for prompt_len in prompt_lens:
+            cumulative_prompt_lens.append(
+                cumulative_prompt_lens[-1] + prompt_len)
+
        # Add generation tokens.
        max_context_len = 0
        max_num_blocks_per_seq = 0
@@ -183,11 +188,14 @@ class Worker:
            for block_table in generation_block_tables]
        block_tables_tensor = torch.tensor(
            padded_block_tables, dtype=torch.int, device='cuda')
+        cumulative_prompt_lens_tensor = torch.tensor(
+            cumulative_prompt_lens, dtype=torch.int, device='cuda')

        input_metadata = InputMetadata(
            seq_groups=seq_groups,
            seq_logprobs=seq_logprobs,
            prompt_lens=prompt_lens,
+            cumulative_prompt_lens=cumulative_prompt_lens_tensor,
            slot_mapping=slot_mapping_tensor,
            context_lens=context_lens_tensor,
            max_context_len=max_context_len,