From 6c64c41b4abfe10f725088afb1fe143d412dc118 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Fri, 30 Jan 2026 22:28:29 -0600 Subject: [PATCH] [ROCm][CI] Force max_num_seqs=1 on ROCm in test_sharded_state_loader to reduce flakiness (#33277) Signed-off-by: Micah Williamson --- .../model_loader/test_sharded_state_loader.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/model_executor/model_loader/test_sharded_state_loader.py b/tests/model_executor/model_loader/test_sharded_state_loader.py index cf06b000e..78134ae38 100644 --- a/tests/model_executor/model_loader/test_sharded_state_loader.py +++ b/tests/model_executor/model_loader/test_sharded_state_loader.py @@ -13,6 +13,7 @@ from huggingface_hub import snapshot_download from vllm import LLM, SamplingParams from vllm.model_executor.model_loader import ShardedStateLoader +from vllm.platforms import current_platform prompts = [ "Hello, my name is", @@ -95,6 +96,10 @@ def test_sharded_state_loader( input_dir = llama_3p2_1b_files ctx = mp.get_context("spawn") + platform_args = {} + if current_platform.is_rocm(): + platform_args["max_num_seqs"] = 1 + # Run in separate processes for memory & CUDA isolation with TemporaryDirectory() as output_dir: p = ctx.Process( @@ -104,6 +109,7 @@ def test_sharded_state_loader( tensor_parallel_size=tp_size, gpu_memory_utilization=gpu_memory_utilization, enforce_eager=True, + **platform_args, ), ) p.start() @@ -118,6 +124,7 @@ def test_sharded_state_loader( enable_lora=enable_lora, gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tp_size, + **platform_args, ), ) p.start() @@ -141,6 +148,7 @@ def test_sharded_state_loader( gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tp_size, load_format="sharded_state", + **platform_args, ), ) p.start()