Optimize KV cache distribution for asymmetric pipeline parallelism (#25164)

Signed-off-by: gholmes829 <g.holmes429@gmail.com>
2025-10-07 04:20:30 -05:00
parent 7e4cd070b0
commit d100d78eb3
5 changed files with 64 additions and 38 deletions
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -253,10 +253,10 @@ class Worker(WorkerBase):
            self.model_runner.profile_run()

            msg = (
-                f"Initial free memory {GiB(self.init_snapshot.free_memory)} "
-                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for "
+                f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} "
+                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for "
                "KV Cache as specified by kv_cache_memory_bytes config and "
-                "skipped memory profiling. This does does not respect the "
+                "skipped memory profiling. This does not respect the "
                "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
                "config when you want manual control of KV cache memory "
                "size. If OOM'ed, check the difference of initial free "