[Core][Hybrid allocator + kv connector 1/n] Enable hybrid allocator + KV cache connector (#25712)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
This commit is contained in:
Kuntai Du
2025-10-24 23:34:18 -07:00
committed by GitHub
parent 56ed7609a9
commit b853540388
15 changed files with 113 additions and 18 deletions

View File

@@ -899,6 +899,7 @@ def test_kv_connector_basic():
scheduler = create_scheduler(
enable_prefix_caching=True,
use_kv_connector=True,
disable_hybrid_kv_cache_manager=True,
)
NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
BLOCK_SIZE = scheduler.cache_config.block_size
@@ -1024,6 +1025,7 @@ def test_external_prefix_cache_metrics():
scheduler = create_scheduler(
enable_prefix_caching=False,
use_kv_connector=True,
disable_hybrid_kv_cache_manager=True,
)
# Mock connector to simulate a partial external cache hit
@@ -1088,6 +1090,7 @@ def test_kv_connector_unable_to_allocate():
use_kv_connector=True,
block_size=BLOCK_SIZE,
num_blocks=NUM_BLOCKS,
disable_hybrid_kv_cache_manager=True,
)
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
@@ -1171,6 +1174,7 @@ def test_kv_connector_handles_preemption():
use_kv_connector=True,
block_size=BLOCK_SIZE,
num_blocks=NUM_BLOCKS,
disable_hybrid_kv_cache_manager=True,
)
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
@@ -1387,6 +1391,7 @@ def create_scheduler_with_priority(
block_size: int = 16,
max_model_len: int | None = None,
num_speculative_tokens: int | None = None,
disable_hybrid_kv_cache_manager: bool = False,
) -> Scheduler:
"""Create scheduler with priority policy enabled.
@@ -1411,6 +1416,7 @@ def create_scheduler_with_priority(
disable_chunked_mm_input=disable_chunked_mm_input,
enable_chunked_prefill=True,
policy="priority", # Enable priority scheduling
disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
)
model_config = ModelConfig(
model=model,
@@ -2018,6 +2024,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
num_blocks=5, # Can hold 64 tokens (first block is null)
block_size=16, # Standard block size
use_kv_connector=True,
disable_hybrid_kv_cache_manager=True,
)
# Create a request and schedule it