[ROCm][CI] Fix Whisper translation test attention backend selection (#38508)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-03-31 00:21:49 -05:00
committed by GitHub
parent 3e802e8786
commit b9cdc85207

View File

@@ -16,10 +16,41 @@ import soundfile as sf
from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import RemoteOpenAIServer
from vllm.logger import init_logger
logger = init_logger(__name__)
SERVER_ARGS = ["--enforce-eager"]
def _get_rocm_attention_config(model_name):
    """Pick the ROCm attention backend config appropriate for *model_name*.

    Whisper relies on cross-attention (ENCODER_DECODER), which the
    ROCM_AITER_FA backend does not support. On MI3xx hardware Whisper is
    served with ROCM_AITER_UNIFIED_ATTN, otherwise with TRITON_ATTN; every
    other model keeps ROCM_AITER_FA. Returns None off ROCm.
    """
    from vllm.platforms import current_platform

    # Non-ROCm platforms need no override at all.
    if not current_platform.is_rocm():
        return None

    # Only Whisper needs the cross-attention-capable backends.
    if "whisper" not in model_name.lower():
        return {"backend": "ROCM_AITER_FA"}

    chosen = "TRITON_ATTN"
    try:
        from vllm.platforms.rocm import _ON_MI3XX
    except ImportError:
        # Keep the TRITON_ATTN fallback if the platform flag is unavailable.
        logger.warning(
            "Could not import _ON_MI3XX from rocm platform, "
            "falling back to TRITON_ATTN for Whisper."
        )
    else:
        if _ON_MI3XX:
            chosen = "ROCM_AITER_UNIFIED_ATTN"
    return {"backend": chosen}
def _get_server_args(attention_config):
"""Get server args with attention backend if specified."""
args = SERVER_ARGS.copy()
@@ -30,10 +61,11 @@ def _get_server_args(attention_config):
@pytest.fixture(
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
)
def server(request, rocm_aiter_fa_attention):
def server(request):
# Parametrize over model name
attention_config = _get_rocm_attention_config(request.param)
with RemoteOpenAIServer(
request.param, _get_server_args(rocm_aiter_fa_attention)
request.param, _get_server_args(attention_config)
) as remote_server:
yield remote_server, request.param
@@ -46,11 +78,12 @@ async def client_and_model(server):
@pytest.mark.asyncio
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
async def test_non_asr_model(foscolo):
# text to text model
model_name = "JackFram/llama-68m"
attention_config = _get_rocm_attention_config(model_name)
with RemoteOpenAIServer(
model_name, _get_server_args(rocm_aiter_fa_attention)
model_name, _get_server_args(attention_config)
) as remote_server:
client = remote_server.get_async_client()
@@ -61,7 +94,7 @@ async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
async def test_basic_audio_with_lora(mary_had_lamb):
"""Ensure STT (translate) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
@@ -85,7 +118,7 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"1",
]
add_attention_backend(server_args, rocm_aiter_fa_attention)
add_attention_backend(server_args, _get_rocm_attention_config(model_name))
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: