[core] platform agnostic executor via collective_rpc (#11256)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Author: youkaichao
Date: 2025-01-15 13:45:21 +08:00
Committed by: GitHub
Parent: f218f9c24d
Commit: ad34c0df0f
43 changed files with 851 additions and 2641 deletions
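
For context, the collective_rpc pattern named in the title: instead of calling worker methods directly, the executor forwards a method name (plus arguments) to every worker and gathers one result per rank, so the executor code stays free of platform-specific imports. Below is a minimal, self-contained sketch of that pattern; the class names, constructor arguments, and collective_rpc signature are illustrative, not the exact vLLM API.

    from typing import Any

    class Worker:
        """Toy stand-in for a per-device worker."""

        def __init__(self, rank: int, is_driver_worker: bool = False):
            self.rank = rank
            self.is_driver_worker = is_driver_worker

        def init_device(self) -> str:
            # A real worker would select its backend (CUDA, ROCm, CPU, ...) here.
            role = "driver" if self.is_driver_worker else "worker"
            return f"rank {self.rank} ({role}): device ready"

    class Executor:
        """Platform-agnostic executor: it never imports a platform,
        it only dispatches method calls to its workers by name."""

        def __init__(self, world_size: int):
            # Flag rank 0 as the driver, mirroring the is_driver_worker
            # keyword this commit adds to Worker.__init__.
            self.workers = [
                Worker(rank, is_driver_worker=(rank == 0))
                for rank in range(world_size)
            ]

        def collective_rpc(self, method: str, *args: Any,
                           **kwargs: Any) -> list[Any]:
            # Invoke `method` on every worker and collect one result per rank.
            return [getattr(w, method)(*args, **kwargs) for w in self.workers]

    executor = Executor(world_size=2)
    print(executor.collective_rpc("init_device"))
    # -> ['rank 0 (driver): device ready', 'rank 1 (worker): device ready']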


@@ -33,6 +33,7 @@ class Worker:
         local_rank: int,
         rank: int,
         distributed_init_method: str,
+        is_driver_worker: bool = False,
     ):
 
         # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
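
A hedged note on the new keyword: historically in vLLM, the driver worker is the rank whose output the engine consumes, while the remaining ranks only participate in collectives, and the default of False keeps existing non-driver call sites unchanged. Continuing the toy sketch above (illustrative wiring, not taken from this diff):

    # Illustrative: surface only the driver's result from a collective call.
    results = executor.collective_rpc("init_device")
    driver_out = next(r for w, r in zip(executor.workers, results)
                      if w.is_driver_worker)
    print(driver_out)  # rank 0 (driver): device ready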
@@ -75,7 +76,7 @@ class Worker:
         else:
             self.profiler = None
 
-    def initialize(self):
+    def init_device(self):
         if self.device_config.device.type == "cuda":
             # torch.distributed.all_reduce does not free the input tensor until
             # the synchronization point. This causes the memory usage to grow
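
The rename matters because collective_rpc dispatches by string: once every platform's worker exposes the same init_device name, the executor can bring up devices without knowing which backend it is driving. A hedged one-liner in the spirit of the sketch above (the exact call site in vLLM may differ):

    # Hypothetical call site: one string, any platform.
    executor.collective_rpc("init_device")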