[Config][Disaggregated] Add timeout configuration for the torch.distributed TCPStore and add KVTransferConfig.kv_connector_extra_config (#14367)
Signed-off-by: Mathis Felardos <mathis@mistral.ai>
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
import dataclasses
|
||||
import datetime
|
||||
import pickle
|
||||
import time
|
||||
from collections import deque
|
||||
@@ -217,6 +218,7 @@ class StatelessProcessGroup:
|
||||
rank: int,
|
||||
world_size: int,
|
||||
data_expiration_seconds: int = 3600,
|
||||
store_timeout: int = 300,
|
||||
) -> "StatelessProcessGroup":
|
||||
"""A replacement for `torch.distributed.init_process_group` that does not
|
||||
pollute the global state.
|
||||
@@ -238,6 +240,7 @@ class StatelessProcessGroup:
|
||||
port=port,
|
||||
world_size=world_size,
|
||||
is_master=(rank == 0),
|
||||
timeout=datetime.timedelta(seconds=store_timeout),
|
||||
)
|
||||
|
||||
return StatelessProcessGroup(
|
||||
|
||||
Reference in New Issue
Block a user