[V1] AsyncLLM data parallel (#13923)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill
2025-03-27 16:14:41 -07:00
committed by GitHub
parent 112b3e5b3b
commit 15dac210f0
18 changed files with 722 additions and 156 deletions

View File

@@ -28,6 +28,7 @@ Multi-node:
--master-port=13345
"""
import os
from time import sleep
from vllm import LLM, SamplingParams
from vllm.utils import get_open_port
@@ -36,14 +37,13 @@ from vllm.utils import get_open_port
def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
dp_master_port, GPUs_per_dp_rank):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
os.environ["VLLM_DP_SIZE"] = str(dp_size)
os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip
os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port)
# set devices for each dp_rank
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
str(i)
for i in range(local_dp_rank * GPUs_per_dp_rank, (local_dp_rank + 1) *
GPUs_per_dp_rank))
# CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the
# engine processes.
# Sample prompts.
prompts = [
@@ -90,6 +90,9 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
f"Generated text: {generated_text!r}")
# Give engines time to pause their processing loops before exiting.
sleep(1)
if __name__ == "__main__":
import argparse
@@ -152,8 +155,13 @@ if __name__ == "__main__":
procs.append(proc)
exit_code = 0
for proc in procs:
proc.join()
if proc.exitcode:
proc.join(timeout=300)
if proc.exitcode is None:
print(f"Killing process {proc.pid} that "
f"didn't stop within 5 minutes.")
proc.kill()
exit_code = 1
elif proc.exitcode:
exit_code = proc.exitcode
exit(exit_code)