[Core][KVConnector] Support HMA+NixlConnector (#35758)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -36,6 +36,7 @@ from vllm.v1.kv_cache_interface import (
|
||||
FullAttentionSpec,
|
||||
KVCacheConfig,
|
||||
KVCacheGroupSpec,
|
||||
SlidingWindowSpec,
|
||||
)
|
||||
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
|
||||
from vllm.v1.request import Request
|
||||
@@ -142,24 +143,26 @@ def create_vllm_config(
|
||||
def create_scheduler(
|
||||
vllm_config: VllmConfig,
|
||||
num_blocks: int = 10000,
|
||||
kv_cache_config: KVCacheConfig | None = None,
|
||||
) -> Scheduler:
|
||||
"""Initialize Scheduler For Testing."""
|
||||
block_size = vllm_config.cache_config.block_size
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=num_blocks, # A large number of blocks to hold all requests
|
||||
kv_cache_tensors=[],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(
|
||||
["layer"],
|
||||
FullAttentionSpec(
|
||||
block_size=block_size,
|
||||
num_kv_heads=1,
|
||||
head_size=1,
|
||||
dtype=torch.float32,
|
||||
),
|
||||
)
|
||||
],
|
||||
)
|
||||
if kv_cache_config is None:
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=num_blocks, # A large number of blocks to hold all requests
|
||||
kv_cache_tensors=[],
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(
|
||||
["layer"],
|
||||
FullAttentionSpec(
|
||||
block_size=block_size,
|
||||
num_kv_heads=1,
|
||||
head_size=1,
|
||||
dtype=torch.float32,
|
||||
),
|
||||
)
|
||||
],
|
||||
)
|
||||
vllm_config.cache_config.num_gpu_blocks = num_blocks
|
||||
return Scheduler(
|
||||
vllm_config=vllm_config,
|
||||
@@ -412,3 +415,38 @@ KVConnectorFactory.register_connector(
|
||||
KVConnectorFactory.register_connector(
|
||||
"MockKVConnector", __name__, MockKVConnector.__name__
|
||||
)
|
||||
|
||||
|
||||
def make_kv_cache_config(
    block_size: int,
    hma_enabled: bool = False,
    sw_size: int = 128,
    num_blocks: int = 100,
) -> KVCacheConfig:
    """Build a KVCacheConfig with one or two KV cache groups.

    The config always contains a full-attention group covering
    ``layer0`` and ``layer2``. When ``hma_enabled`` is true, a second
    group covering ``layer1`` and ``layer3`` is added, backed by a
    ``SlidingWindowSpec``.

    Args:
        block_size: Block size passed to every attention spec.
        hma_enabled: Whether to add the sliding-window group.
        sw_size: Value passed as ``sliding_window`` to the sliding-window
            spec; unused when ``hma_enabled`` is false.
        num_blocks: Value passed as ``num_blocks`` to the returned config.

    Returns:
        A ``KVCacheConfig`` with an empty ``kv_cache_tensors`` list and the
        groups described above.
    """
    # Both specs use the same head layout and dtype; only the attention
    # flavour (and the window size) differs.
    full_attention_spec = FullAttentionSpec(
        block_size=block_size,
        num_kv_heads=4,
        head_size=16,
        dtype=torch.float16,
    )
    full_attention_group = KVCacheGroupSpec(
        ["layer0", "layer2"],
        full_attention_spec,
    )

    if not hma_enabled:
        return KVCacheConfig(
            num_blocks=num_blocks,
            kv_cache_tensors=[],
            kv_cache_groups=[full_attention_group],
        )

    sliding_window_spec = SlidingWindowSpec(
        block_size=block_size,
        num_kv_heads=4,
        head_size=16,
        dtype=torch.float16,
        sliding_window=sw_size,
    )
    sliding_window_group = KVCacheGroupSpec(
        ["layer1", "layer3"],
        sliding_window_spec,
    )
    return KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=[],
        kv_cache_groups=[full_attention_group, sliding_window_group],
    )
|
||||
|
||||
Reference in New Issue
Block a user