[kv_offload+HMA] Fix num_blocks with different per-layer page sizes and improve assert message (#38554)
Signed-off-by: Kfir Toledo <kfir.toledo@ibm.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
This commit is contained in:
@@ -83,6 +83,8 @@ class OffloadingConnectorWorker:
|
||||
if layer_name in layers
|
||||
}
|
||||
|
||||
+ num_blocks = self.spec.kv_cache_config.num_blocks
|
||||
|
||||
# layer_name -> list of matching KV cache tensors
|
||||
# such that each tensor starts with the num_blocks dimension.
|
||||
# FlashAttention layers which use the (2, num_blocks, ...) layout
|
||||
@@ -132,7 +134,6 @@ class OffloadingConnectorWorker:
|
||||
num_blocks_logical_dim
|
||||
)
|
||||
if num_blocks_physical_dim == 0:
|
||||
- num_blocks = layer_kv_cache.shape[num_blocks_logical_dim]
|
||||
storage = layer_kv_cache.untyped_storage()
|
||||
page = layer_kv_cache_spec.page_size_bytes
|
||||
tensors_per_block[layer_name] = (
|
||||
@@ -154,7 +155,6 @@ class OffloadingConnectorWorker:
|
||||
assert num_blocks_physical_dim == 1
|
||||
|
||||
# unbind the tensor to separate K and V tensors
|
||||
- num_blocks = layer_kv_cache.shape[num_blocks_logical_dim]
|
||||
half_page_size = layer_kv_cache_spec.page_size_bytes // 2
|
||||
storage = layer_kv_cache.untyped_storage()
|
||||
raw = (
|
||||
@@ -181,7 +181,6 @@ class OffloadingConnectorWorker:
|
||||
assert len(state_tensors) > 0
|
||||
first_state_tensor = state_tensors[0]
|
||||
assert first_state_tensor.storage_offset() == 0
|
||||
- num_blocks = first_state_tensor.shape[0]
|
||||
tensor = (
|
||||
torch.tensor(
|
||||
[],
|
||||
|
||||
@@ -93,7 +93,12 @@ class OffloadingSpec(ABC):
|
||||
)
|
||||
|
||||
for block_size in self.gpu_block_size:
|
||||
- assert block_size % self.hash_block_size == 0
|
||||
+ assert block_size % self.hash_block_size == 0, (
|
||||
+ f"gpu_block_size={block_size} not divisible by "
|
||||
+ f"hash_block_size={self.hash_block_size}. "
|
||||
+ f"Hybrid models (e.g. Mamba+Attention) need "
|
||||
+ f"--enable-prefix-caching to align block sizes."
|
||||
+ )
|
||||
|
||||
# offloaded_block_size / gpu_block_size
|
||||
self.block_size_factor: int = 1
|
||||
|
||||
Reference in New Issue
Block a user