[Feature] Support Pipeline Parallelism in torchrun SPMD offline inference for V1 (#17827)

Signed-off-by: Lucia Fang <fanglu@fb.com>
This commit is contained in:
Lucia Fang
2025-05-15 22:28:27 -07:00
committed by GitHub
parent 6b31c84aff
commit 3d2779c29a
9 changed files with 55 additions and 27 deletions

View File

@@ -8,6 +8,8 @@ the argument 2 should match the `tensor_parallel_size` below.
see `tests/distributed/test_torchrun_example.py` for the unit test.
"""
import torch.distributed as dist
from vllm import LLM, SamplingParams
# Create prompts, the same across all ranks
@@ -27,23 +29,26 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# all ranks have the same random seed, so that sampling can be
# deterministic across ranks.
llm = LLM(
model="facebook/opt-125m",
model="meta-llama/Llama-3.1-8B",
tensor_parallel_size=2,
pipeline_parallel_size=2,
distributed_executor_backend="external_launcher",
seed=0,
max_model_len=32768,
seed=1,
)
outputs = llm.generate(prompts, sampling_params)
# all ranks will have the same outputs
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
if dist.get_rank() == 0:
print("-" * 50)
"""
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}\n")
print("-" * 50)
"""
Further tips:
1. to communicate control messages across all ranks, use the cpu group,