[Core] support LoRA and prompt adapter in content-based hashing for Block Manager v2 prefix caching (#8240)

This commit is contained in:
Sungjae Lee
2024-12-14 00:51:25 +09:00
committed by GitHub
parent d1fa714cb1
commit c31d4a57a6
10 changed files with 244 additions and 53 deletions

View File

@@ -527,6 +527,19 @@ class Sequence:
hashed_tokens = self.data.get_prefix_token_ids(num_tokens)
return hash((hashed_tokens, self.lora_int_id))
def extra_hash(self) -> Optional[int]:
    """Compute an additional hash component used by prefix caching.

    Returns:
        A hash of ``(prompt_adapter_id, lora_int_id)`` when either
        adapter is active, so that cached blocks from different
        LoRA / prompt-adapter configurations never collide; ``None``
        when neither adapter is in use (id == 0 for both), meaning the
        block hash depends on token ids alone.
    """
    # NOTE: If there are additional factors influencing the block aside
    # from token_ids, include them as input parameters to the hash.
    if self.prompt_adapter_id != 0 or self.lora_int_id != 0:
        return hash((self.prompt_adapter_id, self.lora_int_id))
    return None
def num_hashed_tokens_of_block(self, logical_idx: int):
    """Return the cumulative token count covered through the logical
    block at ``logical_idx``.

    Blocks ``0 .. logical_idx`` inclusive each hold ``block_size``
    tokens, hence ``(logical_idx + 1) * block_size``.
    """
    return (logical_idx + 1) * self.block_size