Optimize KV cache distribution for asymmetric pipeline parallelism (#25164)

Signed-off-by: gholmes829 <g.holmes429@gmail.com>
This commit is contained in:
Grant Holmes (Ren)
2025-10-07 04:20:30 -05:00
committed by GitHub
parent 7e4cd070b0
commit d100d78eb3
5 changed files with 64 additions and 38 deletions

View File

@@ -681,10 +681,10 @@ def test_get_kv_cache_configs_multiple_workers():
num_blocks=10,
kv_cache_tensors=[
KVCacheTensor(
- size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer1"]
+ size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
),
KVCacheTensor(
- size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer2"]
+ size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"]
),
],
kv_cache_groups=[
@@ -718,7 +718,7 @@ def test_get_kv_cache_configs_multiple_workers():
num_blocks=10,
kv_cache_tensors=[
KVCacheTensor(
- size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer1"]
+ size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"]
),
],
kv_cache_groups=[
@@ -802,7 +802,7 @@ def test_get_kv_cache_configs_multiple_workers():
num_blocks=10,
kv_cache_tensors=[
KVCacheTensor(
- size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer3"]
+ size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer3"]
),
],
kv_cache_groups=[
@@ -813,7 +813,7 @@ def test_get_kv_cache_configs_multiple_workers():
num_blocks=10,
kv_cache_tensors=[
KVCacheTensor(
- size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer3"]
+ size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer3"]
),
],
kv_cache_groups=[