From 6cc7abdc66b98ef66141dd89f94f5f265e427f23 Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Tue, 31 Mar 2026 09:00:40 +0300 Subject: [PATCH] [kv_offload+HMA] Fix num_blocks with different per-layer page sizes and improve assert message (#38554) Signed-off-by: Kfir Toledo Co-authored-by: Or Ozeri --- .../kv_transfer/kv_connector/v1/offloading/worker.py | 5 ++--- vllm/v1/kv_offload/spec.py | 7 ++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py index 77398eee8..23c62b6ec 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py @@ -83,6 +83,8 @@ class OffloadingConnectorWorker: if layer_name in layers } + num_blocks = self.spec.kv_cache_config.num_blocks + # layer_name -> list of matching KV cache tensors # such that each tensor starts with the num_blocks dimension. # FlashAttention layers which use the (2, num_blocks, ...) layout @@ -132,7 +134,6 @@ class OffloadingConnectorWorker: num_blocks_logical_dim ) if num_blocks_physical_dim == 0: - num_blocks = layer_kv_cache.shape[num_blocks_logical_dim] storage = layer_kv_cache.untyped_storage() page = layer_kv_cache_spec.page_size_bytes tensors_per_block[layer_name] = ( @@ -154,7 +155,6 @@ class OffloadingConnectorWorker: assert num_blocks_physical_dim == 1 # unbind the tensor to separate K and V tensors - num_blocks = layer_kv_cache.shape[num_blocks_logical_dim] half_page_size = layer_kv_cache_spec.page_size_bytes // 2 storage = layer_kv_cache.untyped_storage() raw = ( @@ -181,7 +181,6 @@ class OffloadingConnectorWorker: assert len(state_tensors) > 0 first_state_tensor = state_tensors[0] assert first_state_tensor.storage_offset() == 0 - num_blocks = first_state_tensor.shape[0] tensor = ( torch.tensor( [], diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py index 1eb4fdb3e..b66b04ffb 100644 --- a/vllm/v1/kv_offload/spec.py +++ b/vllm/v1/kv_offload/spec.py @@ -93,7 +93,12 @@ class OffloadingSpec(ABC): ) for block_size in self.gpu_block_size: - assert block_size % self.hash_block_size == 0 + assert block_size % self.hash_block_size == 0, ( + f"gpu_block_size={block_size} not divisible by " + f"hash_block_size={self.hash_block_size}. " + f"Hybrid models (e.g. Mamba+Attention) need " + f"--enable-prefix-caching to align block sizes." + ) # offloaded_block_size / gpu_block_size self.block_size_factor: int = 1