[Bugfix] Fix torchrun PP broadcast deadlock with async scheduling (#33701)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-02-04 10:17:37 +08:00
committed by GitHub
parent 1b8fe6f7c4
commit 02080179a3
3 changed files with 4 additions and 7 deletions

View File

@@ -32,9 +32,6 @@ llm = LLM(
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
# FIXME(Isotr0py): async scheduling causes deadlock
# on torchrun with PP, need to investigate further.
async_scheduling=False,
)
outputs = llm.generate(prompts, sampling_params)

View File

@@ -39,9 +39,6 @@ llm = LLM(
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0,
# FIXME(Isotr0py): async scheduling causes deadlock
# on torchrun with PP, need to investigate further.
async_scheduling=False,
)
outputs = llm.generate(prompts, sampling_params)

View File

@@ -3666,7 +3666,10 @@ class GPUModelRunner(
)
if self.use_async_scheduling:
pp = get_pp_group()
if pp.world_size > 1 and pp.is_last_rank:
# For torchrun external_launcher PP mode with broadcast_pp_output=True,
# PP outputs have already been broadcast to all ranks at logits computation.
# Therefore, there is no need to send sampled token ids again in this case.
if not self.broadcast_pp_output and pp.world_size > 1 and pp.is_last_rank:
self._pp_broadcast_prev_sampled_token_ids(
sampler_output.sampled_token_ids
)