Allow AsyncLLMEngine.generate to target a specific DP rank (#19102)

Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-04 08:26:47 -07:00
parent 8f4ffbd373
commit c8dcc15921
10 changed files with 97 additions and 5 deletions
--- a/examples/online_serving/multi_instance_data_parallel.py
+++ b/examples/online_serving/multi_instance_data_parallel.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+from typing import Optional
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+
+"""
+To run this example, run the following two commands simultaneously, each
+with a different CUDA_VISIBLE_DEVICES setting:
+    python examples/online_serving/multi_instance_data_parallel.py
+
+    vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \
+        --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \
+        --data-parallel-size-local 1 --enforce-eager --headless
+
+Once both instances have completed the handshake, this example will
+send a single request to the instance with data-parallel (DP) rank 1.
+"""
+
+
+async def main():  # Build an engine client, send one prompt to DP rank 1, print the final text.
+    engine_args = AsyncEngineArgs(
+        model="ibm-research/PowerMoE-3b",
+        data_parallel_size=2,  # same value as `-dp 2` in the `vllm serve` command above
+        dtype="auto",
+        max_model_len=2048,
+        data_parallel_address="127.0.0.1",  # same address as --data-parallel-address above
+        data_parallel_rpc_port=62300,  # same port as --data-parallel-rpc-port above
+        data_parallel_size_local=1,
+        enforce_eager=True,
+    )
+
+    engine_client = AsyncLLMEngine.from_engine_args(engine_args)
+
+    sampling_params = SamplingParams(
+        temperature=0.7,
+        top_p=0.9,
+        max_tokens=100,
+    )
+
+    prompt = "Who won the 2004 World Series?"
+    final_output: Optional[RequestOutput] = None
+    async for output in engine_client.generate(
+        prompt=prompt,
+        sampling_params=sampling_params,
+        request_id="abcdef",
+        data_parallel_rank=1,  # send this request to the instance with DP rank 1
+    ):
+        final_output = output  # keep only the most recently yielded output
+    if final_output:  # still None when generate() yielded nothing
+        print(final_output.outputs[0].text)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())