[ROCm][CI] Fix entrypoints tests and Python-only installation test on ROCm (#28979)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
@@ -14,16 +14,26 @@ import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .conftest import add_attention_backend
|
||||
|
||||
SERVER_ARGS = ["--enforce-eager"]
|
||||
|
||||
|
||||
def _get_server_args(attention_config):
    """Return a fresh server argument list for the given attention config.

    Starts from a copy of the module-level ``SERVER_ARGS`` so the shared
    default list is never modified, then hands that copy to
    ``add_attention_backend`` together with ``attention_config``.
    """
    server_args = list(SERVER_ARGS)
    # NOTE(review): the helper's return value is ignored, so it presumably
    # extends ``server_args`` in place -- confirm against conftest.
    add_attention_backend(server_args, attention_config)
    return server_args
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
|
||||
)
|
||||
def server(request):
|
||||
def server(request, rocm_aiter_fa_attention):
|
||||
# Parametrize over model name
|
||||
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
request.param, _get_server_args(rocm_aiter_fa_attention)
|
||||
) as remote_server:
|
||||
yield remote_server, request.param
|
||||
|
||||
|
||||
@@ -35,10 +45,12 @@ async def client_and_model(server):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_non_asr_model(foscolo):
|
||||
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
|
||||
# text to text model
|
||||
model_name = "JackFram/llama-68m"
|
||||
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
model_name, _get_server_args(rocm_aiter_fa_attention)
|
||||
) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
res = await client.audio.translations.create(
|
||||
model=model_name, file=foscolo, temperature=0.0
|
||||
@@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||
"""Ensure STT (translate) requests can pass LoRA through to generate."""
|
||||
# ROCm SPECIFIC CONFIGURATION:
|
||||
# To ensure the test passes on ROCm, we modify the max model length to 512.
|
||||
# We DO NOT apply this to other platforms to maintain strict upstream parity.
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# NOTE - careful to call this test before the module scoped server
|
||||
# fixture, otherwise it'll OOMkill the CI
|
||||
model_name = "ibm-granite/granite-speech-3.3-2b"
|
||||
@@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
|
||||
"--lora-modules",
|
||||
f"{lora_model_name}={model_name}",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"512" if current_platform.is_rocm() else "2048",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
]
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
|
||||
Reference in New Issue
Block a user