[KVConnector][Core] Support cross-layer KV blocks (#27743)

Signed-off-by: Or Ozeri <oro@il.ibm.com>
This commit is contained in:
Or Ozeri
2025-11-20 20:09:59 +02:00
committed by GitHub
parent e5bfcb6a88
commit 647464719b
15 changed files with 453 additions and 90 deletions

View File

@@ -83,10 +83,18 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
self.gpu_tensors.append(gpu_tensor)
gpu_shape = gpu_tensor.shape
test_shape = attn_backends[layer_name].get_kv_cache_shape(
attn_backend = attn_backends[layer_name]
test_shape = attn_backend.get_kv_cache_shape(
num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256
)
if test_shape[0] == 1234:
if len(gpu_shape) != len(test_shape):
# cross-layers tensor
# shape is (num_blocks, ...)
assert len(gpu_shape) == len(test_shape) + 1
num_blocks_idx = 0
self.kv_dim_before_num_blocks.append(False)
elif test_shape[0] == 1234:
# shape is (num_blocks, ...)
num_blocks_idx = 0
self.kv_dim_before_num_blocks.append(False)