[Core][Hybrid allocator + kv connector 1/n] Enable hybrid allocator + KV cache connector (#25712)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
This commit is contained in:
@@ -899,6 +899,7 @@ def test_kv_connector_basic():
|
||||
scheduler = create_scheduler(
|
||||
enable_prefix_caching=True,
|
||||
use_kv_connector=True,
|
||||
disable_hybrid_kv_cache_manager=True,
|
||||
)
|
||||
NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
|
||||
BLOCK_SIZE = scheduler.cache_config.block_size
|
||||
@@ -1024,6 +1025,7 @@ def test_external_prefix_cache_metrics():
|
||||
scheduler = create_scheduler(
|
||||
enable_prefix_caching=False,
|
||||
use_kv_connector=True,
|
||||
disable_hybrid_kv_cache_manager=True,
|
||||
)
|
||||
|
||||
# Mock connector to simulate a partial external cache hit
|
||||
@@ -1088,6 +1090,7 @@ def test_kv_connector_unable_to_allocate():
|
||||
use_kv_connector=True,
|
||||
block_size=BLOCK_SIZE,
|
||||
num_blocks=NUM_BLOCKS,
|
||||
disable_hybrid_kv_cache_manager=True,
|
||||
)
|
||||
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
|
||||
scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
|
||||
@@ -1171,6 +1174,7 @@ def test_kv_connector_handles_preemption():
|
||||
use_kv_connector=True,
|
||||
block_size=BLOCK_SIZE,
|
||||
num_blocks=NUM_BLOCKS,
|
||||
disable_hybrid_kv_cache_manager=True,
|
||||
)
|
||||
|
||||
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
|
||||
@@ -1387,6 +1391,7 @@ def create_scheduler_with_priority(
|
||||
block_size: int = 16,
|
||||
max_model_len: int | None = None,
|
||||
num_speculative_tokens: int | None = None,
|
||||
disable_hybrid_kv_cache_manager: bool = False,
|
||||
) -> Scheduler:
|
||||
"""Create scheduler with priority policy enabled.
|
||||
|
||||
@@ -1411,6 +1416,7 @@ def create_scheduler_with_priority(
|
||||
disable_chunked_mm_input=disable_chunked_mm_input,
|
||||
enable_chunked_prefill=True,
|
||||
policy="priority", # Enable priority scheduling
|
||||
disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model=model,
|
||||
@@ -2018,6 +2024,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
|
||||
num_blocks=5, # Can hold 64 tokens (first block is null)
|
||||
block_size=16, # Standard block size
|
||||
use_kv_connector=True,
|
||||
disable_hybrid_kv_cache_manager=True,
|
||||
)
|
||||
|
||||
# Create a request and schedule it
|
||||
|
||||
Reference in New Issue
Block a user