From 15422ed3f748b2c26a9446db388979027a254199 Mon Sep 17 00:00:00 2001
From: Ryan Rock
Date: Wed, 14 Jan 2026 22:01:42 -0600
Subject: [PATCH] [CI/Build][Hardware][AMD] Fix v1/shutdown (#31997)

Signed-off-by: Ryan Rock
---
 tests/v1/shutdown/conftest.py           | 26 +++++++++++++++++++++++++
 tests/v1/shutdown/test_forward_error.py | 20 +++++++++++++++++--
 tests/v1/shutdown/test_startup_error.py | 21 +++++++++++++++++++-
 3 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 tests/v1/shutdown/conftest.py

diff --git a/tests/v1/shutdown/conftest.py b/tests/v1/shutdown/conftest.py
new file mode 100644
index 000000000..b276d0fab
--- /dev/null
+++ b/tests/v1/shutdown/conftest.py
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from collections.abc import Iterable
+from pathlib import Path
+
+import pytest
+
+from vllm.platforms import current_platform
+
+
+@pytest.fixture
+def rocm_sitecustomize_factory(monkeypatch, tmp_path: Path):
+    """Return a function that installs a given sitecustomize payload."""
+    if not current_platform.is_rocm():
+        return lambda _: None
+
+    def install(lines: Iterable[str]) -> None:
+        sc = tmp_path / "sitecustomize.py"
+        sc.write_text("\n".join(lines) + "\n")
+        monkeypatch.setenv(
+            "PYTHONPATH",
+            os.pathsep.join(filter(None, [str(tmp_path), os.getenv("PYTHONPATH")])),
+        )
+
+    return install
diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py
index a751b2d91..4625bc174 100644
--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -3,6 +3,7 @@
 """Test that we handle an Error in model forward and shutdown."""
 
 import asyncio
+import inspect
 
 import pytest
 
@@ -38,11 +39,22 @@ def evil_forward(self, *args, **kwargs):
     return self.model(*args, **kwargs)
 
 
+@pytest.fixture
+def rocm_evil_forward(rocm_sitecustomize_factory):
+    lines = [
+        "from vllm.distributed import get_tensor_model_parallel_rank",
+        "from vllm.model_executor.models.llama import LlamaForCausalLM",
+        inspect.getsource(evil_forward),
+        f"LlamaForCausalLM.forward = {evil_forward.__name__}",
+    ]
+    rocm_sitecustomize_factory(lines)
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("model", MODELS)
 async def test_async_llm_model_error(
-    monkeypatch, tensor_parallel_size: int, model: str
+    monkeypatch, rocm_evil_forward, tensor_parallel_size: int, model: str
 ) -> None:
     """Test that AsyncLLM propagates a forward pass error and frees memory.
 
@@ -104,7 +116,11 @@ async def test_async_llm_model_error(
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("model", MODELS)
 def test_llm_model_error(
-    monkeypatch, tensor_parallel_size: int, enable_multiprocessing: bool, model: str
+    monkeypatch,
+    rocm_evil_forward,
+    tensor_parallel_size: int,
+    enable_multiprocessing: bool,
+    model: str,
 ) -> None:
     """Test that LLM propagates a forward pass error and frees memory.
     TODO(andy) - LLM without multiprocessing; LLM with multiprocessing
diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py
index c1594cc2e..7925dc14b 100644
--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test that we handle a startup Error and shutdown."""
 
+import inspect
+
 import pytest
 
 from tests.utils import wait_for_gpu_memory_to_clear
@@ -28,12 +30,28 @@ def evil_method(self, *args, **kwargs):
     return self.model(*args, **kwargs, intermediate_tensors=None)
 
 
+@pytest.fixture
+def rocm_evil_method(rocm_sitecustomize_factory, request):
+    failing_method = request.getfixturevalue("failing_method")
+    lines = [
+        "from vllm.distributed import get_tensor_model_parallel_rank",
+        "from vllm.model_executor.models.llama import LlamaForCausalLM",
+        inspect.getsource(evil_method),
+        f"LlamaForCausalLM.{failing_method} = {evil_method.__name__}",
+    ]
+    rocm_sitecustomize_factory(lines)
+
+
 @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tensor_parallel_size", [2, 1])
 @pytest.mark.parametrize("failing_method", ["forward", "load_weights"])
 def test_async_llm_startup_error(
-    monkeypatch, model: str, tensor_parallel_size: int, failing_method: str
+    monkeypatch,
+    rocm_evil_method,
+    model: str,
+    tensor_parallel_size: int,
+    failing_method: str,
 ) -> None:
     """Test that AsyncLLM propagates an __init__ error & frees memory.
     Test profiling (forward()) and load weights failures.
@@ -67,6 +85,7 @@ def test_async_llm_startup_error(
 @pytest.mark.parametrize("failing_method", ["forward", "load_weights"])
 def test_llm_startup_error(
     monkeypatch,
+    rocm_evil_method,
     model: str,
     tensor_parallel_size: int,
     enable_multiprocessing: bool,