diff --git a/tests/model_executor/model_loader/test_sharded_state_loader.py b/tests/model_executor/model_loader/test_sharded_state_loader.py index cf06b000e..78134ae38 100644 --- a/tests/model_executor/model_loader/test_sharded_state_loader.py +++ b/tests/model_executor/model_loader/test_sharded_state_loader.py @@ -13,6 +13,7 @@ from huggingface_hub import snapshot_download from vllm import LLM, SamplingParams from vllm.model_executor.model_loader import ShardedStateLoader +from vllm.platforms import current_platform prompts = [ "Hello, my name is", @@ -95,6 +96,10 @@ def test_sharded_state_loader( input_dir = llama_3p2_1b_files ctx = mp.get_context("spawn") + platform_args = {} + if current_platform.is_rocm(): + platform_args["max_num_seqs"] = 1 + # Run in separate processes for memory & CUDA isolation with TemporaryDirectory() as output_dir: p = ctx.Process( @@ -104,6 +109,7 @@ def test_sharded_state_loader( tensor_parallel_size=tp_size, gpu_memory_utilization=gpu_memory_utilization, enforce_eager=True, + **platform_args, ), ) p.start() @@ -118,6 +124,7 @@ def test_sharded_state_loader( enable_lora=enable_lora, gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tp_size, + **platform_args, ), ) p.start() @@ -141,6 +148,7 @@ def test_sharded_state_loader( gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tp_size, load_format="sharded_state", + **platform_args, ), ) p.start()