[core] platform agnostic executor via collective_rpc (#11256)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Author: youkaichao
Date: 2025-01-15 13:45:21 +08:00
Committed by: GitHub
Parent: f218f9c24d
Commit: ad34c0df0f
43 changed files with 851 additions and 2641 deletions
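
For context, the collective_rpc pattern named in the title: instead of calling worker methods directly, the executor forwards a method name (plus arguments) to every worker and gathers one result per rank, so the executor code stays free of platform-specific imports. Below is a minimal, self-contained sketch of that pattern; the class names, constructor arguments, and collective_rpc signature are illustrative, not the exact vLLM API.

    from typing import Any

    class Worker:
        """Toy stand-in for a per-device worker."""

        def __init__(self, rank: int, is_driver_worker: bool = False):
            self.rank = rank
            self.is_driver_worker = is_driver_worker

        def init_device(self) -> str:
            # A real worker would select its backend (CUDA, ROCm, CPU, ...) here.
            role = "driver" if self.is_driver_worker else "worker"
            return f"rank {self.rank} ({role}): device ready"

    class Executor:
        """Platform-agnostic executor: it never imports a platform,
        it only dispatches method calls to its workers by name."""

        def __init__(self, world_size: int):
            # Flag rank 0 as the driver, mirroring the is_driver_worker
            # keyword this commit adds to Worker.__init__.
            self.workers = [
                Worker(rank, is_driver_worker=(rank == 0))
                for rank in range(world_size)
            ]

        def collective_rpc(self, method: str, *args: Any,
                           **kwargs: Any) -> list[Any]:
            # Invoke `method` on every worker and collect one result per rank.
            return [getattr(w, method)(*args, **kwargs) for w in self.workers]

    executor = Executor(world_size=2)
    print(executor.collective_rpc("init_device"))
    # -> ['rank 0 (driver): device ready', 'rank 1 (worker): device ready']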


@@ -33,6 +33,7 @@ class Worker:
         local_rank: int,
         rank: int,
         distributed_init_method: str,
+        is_driver_worker: bool = False,
     ):
 
         # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
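
A hedged note on the new keyword: historically in vLLM, the driver worker is the rank whose output the engine consumes, while the remaining ranks only participate in collectives, and the default of False keeps existing non-driver call sites unchanged. Continuing the toy sketch above (illustrative wiring, not taken from this diff):

    # Illustrative: surface only the driver's result from a collective call.
    results = executor.collective_rpc("init_device")
    driver_out = next(r for w, r in zip(executor.workers, results)
                      if w.is_driver_worker)
    print(driver_out)  # rank 0 (driver): device ready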
@@ -75,7 +76,7 @@ class Worker:
         else:
             self.profiler = None
 
-    def initialize(self):
+    def init_device(self):
         if self.device_config.device.type == "cuda":
             # torch.distributed.all_reduce does not free the input tensor until
             # the synchronization point. This causes the memory usage to grow
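
The rename matters because collective_rpc dispatches by string: once every platform's worker exposes the same init_device name, the executor can bring up devices without knowing which backend it is driving. A hedged one-liner in the spirit of the sketch above (the exact call site in vLLM may differ):

    # Hypothetical call site: one string, any platform.
    executor.collective_rpc("init_device")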