[ROCm][CI] Fix entrypoints tests and Python-only installation test on ROCm (#28979)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2025-12-24 00:42:30 -06:00
parent 8ee90c83f8
commit 0247a91e00
26 changed files with 432 additions and 116 deletions
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -14,16 +14,26 @@ import pytest_asyncio
 import soundfile as sf

 from ...utils import RemoteOpenAIServer
+from .conftest import add_attention_backend

 SERVER_ARGS = ["--enforce-eager"]


+def _get_server_args(attention_config):
+    """Return a copy of SERVER_ARGS that was passed to add_attention_backend."""
+    args = SERVER_ARGS.copy()
+    add_attention_backend(args, attention_config)
+    return args
+
+
@pytest.fixture(
    scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
 )
-def server(request):
+def server(request, rocm_aiter_fa_attention):
    # Parametrize over model name
-    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
+    with RemoteOpenAIServer(
+        request.param, _get_server_args(rocm_aiter_fa_attention)
+    ) as remote_server:
        yield remote_server, request.param


@@ -35,10 +45,12 @@ async def client_and_model(server):


@pytest.mark.asyncio
-async def test_non_asr_model(foscolo):
+async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
    # text to text model
    model_name = "JackFram/llama-68m"
-    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, _get_server_args(rocm_aiter_fa_attention)
+    ) as remote_server:
        client = remote_server.get_async_client()
        res = await client.audio.translations.create(
            model=model_name, file=foscolo, temperature=0.0
@@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):


@pytest.mark.asyncio
-async def test_basic_audio_with_lora(mary_had_lamb):
+async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
    """Ensure STT (translate) requests can pass LoRA through to generate."""
+    # ROCm-specific configuration:
+    # To ensure the test passes on ROCm, --max-model-len is lowered from 2048 to 512.
+    # Other platforms keep 2048 to maintain strict upstream parity.
+    from vllm.platforms import current_platform
+
    # NOTE - careful to call this test before the module scoped server
    # fixture, otherwise it'll OOMkill the CI
    model_name = "ibm-granite/granite-speech-3.3-2b"
@@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
-        "2048",
+        "512" if current_platform.is_rocm() else "2048",
        "--max-num-seqs",
        "1",
    ]

+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()