[V1] AsyncLLM data parallel (#13923)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill
2025-03-27 16:14:41 -07:00
committed by GitHub
parent 112b3e5b3b
commit 15dac210f0
18 changed files with 722 additions and 156 deletions

View File

@@ -15,6 +15,8 @@ import torch
from torch.distributed import ProcessGroup, TCPStore
from torch.distributed.distributed_c10d import (Backend, PrefixStore,
_get_default_timeout,
_shutdown_backend,
_unregister_process_group,
is_nccl_available)
from torch.distributed.rendezvous import rendezvous
@@ -333,3 +335,13 @@ def stateless_init_torch_distributed_process_group(
pg._register_backend(device, backend_type, backend_class)
return pg
def stateless_destroy_torch_distributed_process_group(
        pg: ProcessGroup) -> None:
    """Tear down a stateless process group.

    This is the counterpart of
    stateless_init_torch_distributed_process_group(): `pg` is expected to
    be a ProcessGroup returned by that function.

    The teardown is two steps, performed in this order:
      1. `_shutdown_backend(pg)` is invoked on the group.
      2. The group is unregistered via `_unregister_process_group`,
         looked up by its `group_name`.
    """
    # Step 1: shut the group's backend down before touching the registry.
    _shutdown_backend(pg)

    # Step 2: drop the registry entry keyed by this group's name.
    registered_name = pg.group_name
    _unregister_process_group(registered_name)