[Feature] Add --distributed-timeout-seconds CLI option (#36047)

Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
This commit is contained in:
Shiyan Deng
2026-03-05 20:57:51 -08:00
committed by GitHub
parent 8e87cc57f1
commit 03a49bb8f0
3 changed files with 25 additions and 2 deletions

View File

@@ -6,6 +6,7 @@ import gc
import os
from collections.abc import Callable
from contextlib import AbstractContextManager, nullcontext
+from datetime import timedelta
from types import NoneType
from typing import TYPE_CHECKING, Any
@@ -942,8 +943,18 @@ def init_worker_distributed_environment(
     set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
     init_method = distributed_init_method or "env://"
+    timeout = None
+    if parallel_config.distributed_timeout_seconds is not None:
+        timeout = timedelta(seconds=parallel_config.distributed_timeout_seconds)
     init_distributed_environment(
-        parallel_config.world_size, rank, init_method, local_rank, backend
+        parallel_config.world_size,
+        rank,
+        init_method,
+        local_rank,
+        backend,
+        timeout,
     )
     ensure_model_parallel_initialized(