[Bugfix] Fix inner_dp_world initialization order for multi-node TP (#35892)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com> Signed-off-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> Co-authored-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -67,6 +67,7 @@ steps:
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
- tests/distributed/test_multiproc_executor.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
@@ -95,6 +96,8 @@ steps:
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# test multi-node TP with multiproc executor (simulated on single node)
|
||||
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
|
||||
@@ -9,11 +9,11 @@ focusing on executor initialization, RPC calls, and distributed execution.
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
import socket
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import get_open_port
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
|
||||
|
||||
@@ -333,7 +333,9 @@ def test_multiproc_executor_multi_node():
|
||||
- Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
|
||||
Total world_size = 4, nnodes = 2
|
||||
"""
|
||||
port = get_open_port()
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.bind(("", 0))
|
||||
port = s.getsockname()[1]
|
||||
# symm_mem does not work for simulating multi instance in single node
|
||||
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
|
||||
|
||||
|
||||
@@ -608,7 +608,6 @@ class WorkerProc:
|
||||
)
|
||||
|
||||
# Load model
|
||||
self._init_message_queues(input_shm_handle, vllm_config)
|
||||
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
|
||||
if not is_eep_new_worker:
|
||||
self.worker.init_device()
|
||||
@@ -618,6 +617,10 @@ class WorkerProc:
|
||||
)
|
||||
self.worker.load_model()
|
||||
|
||||
# Initialize message queues after init_device() since multi-node setups
|
||||
# (nnodes_within_dp > 1) require distributed groups to be initialized
|
||||
self._init_message_queues(input_shm_handle, vllm_config)
|
||||
|
||||
# Enable environment variable cache (e.g. assume no more
|
||||
# environment variable overrides after this point)
|
||||
enable_envs_cache()
|
||||
|
||||
Reference in New Issue
Block a user