Optimize KV cache distribution for asymmetric pipeline parallelism (#25164)

Signed-off-by: gholmes829 <g.holmes429@gmail.com>
2025-10-07 04:20:30 -05:00
parent 7e4cd070b0
commit d100d78eb3
5 changed files with 64 additions and 38 deletions
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -124,7 +124,7 @@ class CacheConfig:
    gpu_memory_utilization. However, users may want to manually specify
    the kv cache memory size. kv_cache_memory_bytes allows more fine-grain
    control of how much memory gets used when compared with using
-    gpu_memory_memory_utilization. Note that kv_cache_memory_bytes
+    gpu_memory_utilization. Note that kv_cache_memory_bytes
    (when not-None) ignores gpu_memory_utilization"""

    def compute_hash(self) -> str: