[Core][KVConnector] Support HMA+NixlConnector (#35758)

Signed-off-by: NickLucche <nlucches@redhat.com>

Author: Nicolò Lucchesi
Date: 2026-03-06 08:51:21 +01:00
Committed by: GitHub
Parent: 90f3c01fa4
Commit: 5b3ba94ab4
10 changed files with 669 additions and 230 deletions


@@ -36,6 +36,7 @@ from vllm.v1.kv_cache_interface import (
     FullAttentionSpec,
     KVCacheConfig,
     KVCacheGroupSpec,
+    SlidingWindowSpec,
 )
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.request import Request
@@ -142,24 +143,26 @@ def create_vllm_config(
 def create_scheduler(
     vllm_config: VllmConfig,
     num_blocks: int = 10000,
+    kv_cache_config: KVCacheConfig | None = None,
 ) -> Scheduler:
     """Initialize Scheduler For Testing."""
     block_size = vllm_config.cache_config.block_size
-    kv_cache_config = KVCacheConfig(
-        num_blocks=num_blocks,  # A large number of blocks to hold all requests
-        kv_cache_tensors=[],
-        kv_cache_groups=[
-            KVCacheGroupSpec(
-                ["layer"],
-                FullAttentionSpec(
-                    block_size=block_size,
-                    num_kv_heads=1,
-                    head_size=1,
-                    dtype=torch.float32,
-                ),
-            )
-        ],
-    )
+    if kv_cache_config is None:
+        kv_cache_config = KVCacheConfig(
+            num_blocks=num_blocks,  # A large number of blocks to hold all requests
+            kv_cache_tensors=[],
+            kv_cache_groups=[
+                KVCacheGroupSpec(
+                    ["layer"],
+                    FullAttentionSpec(
+                        block_size=block_size,
+                        num_kv_heads=1,
+                        head_size=1,
+                        dtype=torch.float32,
+                    ),
+                )
+            ],
+        )
     vllm_config.cache_config.num_gpu_blocks = num_blocks
     return Scheduler(
         vllm_config=vllm_config,
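
Note (not part of the diff): with kv_cache_config now optional, a test can inject its own layout while omitting the argument keeps the old single full-attention default. A minimal usage sketch, assuming a vllm_config produced by this module's create_vllm_config() helper and the make_kv_cache_config() helper added further down in this commit:

# Sketch only; the names below come from this test module, not from new APIs.
vllm_config = create_vllm_config()
block_size = vllm_config.cache_config.block_size

# Previous behaviour is preserved: no argument builds the default
# single full-attention group.
scheduler = create_scheduler(vllm_config)

# New behaviour: pass an explicit hybrid (HMA) config. num_blocks is set to
# match the config's own default block budget to keep the two counts consistent.
hybrid_cfg = make_kv_cache_config(block_size, hma_enabled=True)
scheduler = create_scheduler(vllm_config, num_blocks=100, kv_cache_config=hybrid_cfg)
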
@@ -412,3 +415,38 @@ KVConnectorFactory.register_connector(
 KVConnectorFactory.register_connector(
     "MockKVConnector", __name__, MockKVConnector.__name__
 )
+
+
+def make_kv_cache_config(
+    block_size: int,
+    hma_enabled: bool = False,
+    sw_size: int = 128,
+    num_blocks: int = 100,
+) -> KVCacheConfig:
+    kv_cache_groups = [
+        KVCacheGroupSpec(
+            ["layer0", "layer2"],
+            FullAttentionSpec(
+                block_size=block_size,
+                num_kv_heads=4,
+                head_size=16,
+                dtype=torch.float16,
+            ),
+        )
+    ]
+    if hma_enabled:
+        kv_cache_groups.append(
+            KVCacheGroupSpec(
+                ["layer1", "layer3"],
+                SlidingWindowSpec(
+                    block_size=block_size,
+                    num_kv_heads=4,
+                    head_size=16,
+                    dtype=torch.float16,
+                    sliding_window=sw_size,
+                ),
+            )
+        )
+    return KVCacheConfig(
+        num_blocks=num_blocks, kv_cache_tensors=[], kv_cache_groups=kv_cache_groups
+    )
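
For reference, a short sketch (also not part of the commit) of what the new helper yields; the field names layer_names and kv_cache_spec on KVCacheGroupSpec are assumed from vLLM's kv_cache_interface:

# Sketch only: with hma_enabled=True the helper returns the two-group hybrid
# (HMA) layout that the NixlConnector tests exercise.
cfg = make_kv_cache_config(block_size=16, hma_enabled=True, sw_size=128)
assert len(cfg.kv_cache_groups) == 2
full_attn, sliding = cfg.kv_cache_groups
assert full_attn.layer_names == ["layer0", "layer2"]   # FullAttentionSpec group
assert sliding.layer_names == ["layer1", "layer3"]     # SlidingWindowSpec group
assert sliding.kv_cache_spec.sliding_window == 128

# With the default hma_enabled=False only the full-attention group remains.
assert len(make_kv_cache_config(block_size=16).kv_cache_groups) == 1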