diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 06a0b5212..47658e505 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -149,7 +149,7 @@ steps: num_devices: 2 commands: - pytest -v -s tests/distributed/test_context_parallel.py - # - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 83e12710a..842e76549 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -1006,6 +1006,10 @@ class Worker(WorkerBase): load_weights=load_weights_direct, ) + # NCCL broadcast/packed paths are asynchronous. + # Sync here so the next step uses the new weights. + torch.accelerator.synchronize() + def shutdown(self) -> None: # has_kv_transfer_group can be None during interpreter shutdown. if ensure_kv_transfer_shutdown is not None: