[Core] support LoRA and prompt adapter in content-based hashing for Block Manager v2 prefix caching (#8240)

This commit is contained in:
Sungjae Lee
2024-12-14 00:51:25 +09:00
committed by GitHub
parent d1fa714cb1
commit c31d4a57a6
10 changed files with 244 additions and 53 deletions

View File

@@ -527,6 +527,19 @@ class Sequence:
hashed_tokens = self.data.get_prefix_token_ids(num_tokens)
return hash((hashed_tokens, self.lora_int_id))
def extra_hash(self) -> Optional[int]:
    """Compute an additional hash component used by prefix caching.

    Returns:
        A hash of ``(prompt_adapter_id, lora_int_id)`` when either
        adapter is active, so that cached blocks from different
        LoRA / prompt-adapter configurations never collide; ``None``
        when neither adapter is in use (id == 0 for both), meaning the
        block hash depends on token ids alone.
    """
    # NOTE: If there are additional factors influencing the block aside
    # from token_ids, include them as input parameters to the hash.
    if self.prompt_adapter_id != 0 or self.lora_int_id != 0:
        return hash((self.prompt_adapter_id, self.lora_int_id))
    return None
def num_hashed_tokens_of_block(self, logical_idx: int):
    """Return the cumulative token count covered through the logical
    block at ``logical_idx``.

    Blocks ``0 .. logical_idx`` inclusive each hold ``block_size``
    tokens, hence ``(logical_idx + 1) * block_size``.
    """
    return (logical_idx + 1) * self.block_size