[ROCm][CI] Fix entrypoints tests and Python-only installation test on ROCm (#28979)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2025-12-24 00:42:30 -06:00
committed by GitHub
parent 8ee90c83f8
commit 0247a91e00
26 changed files with 432 additions and 116 deletions

View File

@@ -14,16 +14,26 @@ import pytest_asyncio
import soundfile as sf
from ...utils import RemoteOpenAIServer
from .conftest import add_attention_backend
SERVER_ARGS = ["--enforce-eager"]
def _get_server_args(attention_config):
"""Get server args with attention backend if specified."""
args = SERVER_ARGS.copy()
add_attention_backend(args, attention_config)
return args
@pytest.fixture(
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
)
def server(request):
def server(request, rocm_aiter_fa_attention):
# Parametrize over model name
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
with RemoteOpenAIServer(
request.param, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
yield remote_server, request.param
@@ -35,10 +45,12 @@ async def client_and_model(server):
@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
# text to text model
model_name = "JackFram/llama-68m"
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
with RemoteOpenAIServer(
model_name, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
client = remote_server.get_async_client()
res = await client.audio.translations.create(
model=model_name, file=foscolo, temperature=0.0
@@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (translate) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
# NOTE - careful to call this test before the module scoped server
# fixture, otherwise it'll OOMkill the CI
model_name = "ibm-granite/granite-speech-3.3-2b"
@@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
"--lora-modules",
f"{lora_model_name}={model_name}",
"--max-model-len",
"2048",
"512" if current_platform.is_rocm() else "2048",
"--max-num-seqs",
"1",
]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()