[EC Connector] Optimize remote cache check in scheduler (#32585)

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
This commit is contained in:
knlnguyen1802
2026-01-22 11:30:59 +08:00
committed by GitHub
parent c5487e2b96
commit 378385b90c
5 changed files with 76 additions and 57 deletions

View File

@@ -2560,15 +2560,14 @@ def test_ec_connector_cache_hit_external_load(use_kv_connector):
mm_positions=mm_positions,
)[0]
# Mock cache hit - encoder cache exists externally
scheduler.ec_connector.has_caches = Mock(return_value=[True])
# Mock cache hit - encoder cache has_exists externally
scheduler.ec_connector.has_cache_item = Mock(return_value=True)
scheduler.ec_connector.update_state_after_alloc = Mock(
wraps=scheduler.ec_connector.update_state_after_alloc
)
scheduler.add_request(request)
output = scheduler.schedule()
# Should schedule prompt tokens
scheduled_tokens = output.num_scheduled_tokens[request.request_id]
assert scheduled_tokens == NUM_TOKENS
@@ -2611,7 +2610,7 @@ def test_ec_connector_cache_miss_computes_locally(use_kv_connector):
)[0]
# Mock cache miss - encoder cache doesn't exist externally
scheduler.ec_connector.has_caches = Mock(return_value=[False])
scheduler.ec_connector.has_cache_item = Mock(return_value=False)
scheduler.add_request(request_mm_missed)
output = scheduler.schedule()
@@ -2665,7 +2664,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
PlaceholderRange(offset=250, length=NUM_ENCODER_TOKENS_1),
]
]
has_cache_item_result_map_1 = {"hash1_A": False, "hash1_B": True, "hash1_F": True}
# Create request with 4 MM items, with 2 identical items
request1 = create_requests(
num_requests=1,
@@ -2676,7 +2675,9 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
)[0]
# Mock partial cache hit: 1st and 3rd missing, 2nd and 4th exist
scheduler.ec_connector.has_caches = Mock(return_value=[False, True, False, True])
scheduler.ec_connector.has_cache_item = Mock(
side_effect=lambda hash_val: has_cache_item_result_map_1[hash_val]
)
scheduler.ec_connector.update_state_after_alloc = Mock(
wraps=scheduler.ec_connector.update_state_after_alloc
)
@@ -2736,7 +2737,12 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
PlaceholderRange(offset=250, length=NUM_ENCODER_TOKENS_2),
]
]
has_cache_item_result_map_2 = {
"hash1_C": True,
"hash1_D": False,
"hash1_E": False,
"hash1_A": True,
}
request2 = create_requests(
num_requests=1,
num_tokens=NUM_TOKENS_2,
@@ -2746,7 +2752,9 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
)[0]
# Mock partial cache hit: only hash1_A and hash1_C exist in connector
scheduler.ec_connector.has_caches = Mock(return_value=[True, False, False, True])
scheduler.ec_connector.has_cache_item = Mock(
side_effect=lambda hash_val: has_cache_item_result_map_2[hash_val]
)
scheduler.add_request(request2)
output = scheduler.schedule()
@@ -2821,9 +2829,9 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector):
if cache_exist == "connector_only":
# Cache exist in ec_connector
scheduler.ec_connector.has_caches = Mock(return_value=[True])
scheduler.ec_connector.has_cache_item = Mock(return_value=True)
elif cache_exist == "no_where":
scheduler.ec_connector.has_caches = Mock(return_value=[False])
scheduler.ec_connector.has_cache_item = Mock(return_value=False)
output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == len(requests)
@@ -2887,7 +2895,7 @@ def test_ec_connector_unable_to_allocate(use_kv_connector):
)
# Mock ec_connector load external cache behavior
scheduler.ec_connector.has_caches = Mock(return_value=[True])
scheduler.ec_connector.has_cache_item = Mock(return_value=True)
scheduler.ec_connector.update_state_after_alloc = Mock(
wraps=scheduler.ec_connector.update_state_after_alloc
)
@@ -2984,7 +2992,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
)
# Mock cache hit: Both cache exist in connector (at E->PD initially)
scheduler.ec_connector.has_caches = Mock(return_value=[True])
scheduler.ec_connector.has_cache_item = Mock(return_value=True)
scheduler.ec_connector.update_state_after_alloc = Mock(
wraps=scheduler.ec_connector.update_state_after_alloc
)
@@ -3139,9 +3147,9 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
if cache_exist == "connector_only":
# Cache exist in ec_connector
scheduler.ec_connector.has_caches = Mock(return_value=[True])
scheduler.ec_connector.has_cache_item = Mock(return_value=True)
elif cache_exist == "no_where":
scheduler.ec_connector.has_caches = Mock(return_value=[False])
scheduler.ec_connector.has_cache_item = Mock(return_value=False)
# 4th Schedule - this should trigger req_low resumption from waiting
output = scheduler.schedule()
@@ -3259,8 +3267,8 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
)[0]
# Mock cache hit: MM of request1 NOT cached remotely, request2 cached remotely
scheduler.ec_connector.has_caches = Mock(
side_effect=lambda req: [True, True, True] if req == request2 else [False]
scheduler.ec_connector.has_cache_item = Mock(
side_effect=lambda hash_value: hash_value in mm_hashes_list_2[0]
)
scheduler.ec_connector.update_state_after_alloc = Mock(
wraps=scheduler.ec_connector.update_state_after_alloc