[ROCm][CI] Fix tests/compile unit tests (#28895)

Signed-off-by: charlifu <charlifu@amd.com>
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
Signed-off-by: Charlie Fu <Charlie.Fu@amd.com>
Co-authored-by: Micah Williamson <micah.williamson@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
Charlie Fu
2026-01-06 12:50:43 -06:00
committed by GitHub
parent f7008ce1c4
commit c07163663d
3 changed files with 30 additions and 13 deletions

View File

@@ -5,10 +5,13 @@ import dataclasses

 import pytest

 from vllm.config import CompilationMode
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import cuda_device_count_stateless

 from ...utils import compare_all_settings

+ATTN_BACKEND = "FLASH_ATTN" if not current_platform.is_rocm() else "ROCM_ATTN"
+

 @dataclasses.dataclass
 class TestSetting:
@@ -31,7 +34,7 @@ class TestSetting:
         model_args=["--max-model-len", "2048"],
         pp_size=2,
         tp_size=2,
-        attn_backend="FLASH_ATTN",
+        attn_backend=ATTN_BACKEND,
         method="generate",
     ),
     # llama model with quantization
@@ -40,7 +43,7 @@ class TestSetting:
         model_args=["--quantization", "gptq", "--max-model-len", "2048"],
         pp_size=1,
         tp_size=1,
-        attn_backend="FLASH_ATTN",
+        attn_backend=ATTN_BACKEND,
         method="generate",
     ),
     # MoE model
@@ -49,7 +52,7 @@ class TestSetting:
         model_args=["--max-model-len", "2048"],
         pp_size=1,
         tp_size=2,
-        attn_backend="FLASH_ATTN",
+        attn_backend=ATTN_BACKEND,
         method="generate",
     ),
     # embedding model
@@ -65,16 +68,22 @@ class TestSetting:
         ],
         pp_size=1,
         tp_size=1,
-        attn_backend="FLASH_ATTN",
+        attn_backend=ATTN_BACKEND,
         method="encode",
     ),
-    TestSetting(
-        model="BAAI/bge-base-en-v1.5",
-        model_args=["--runner", "pooling"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="encode",
+    pytest.param(
+        TestSetting(
+            model="BAAI/bge-base-en-v1.5",
+            model_args=["--runner", "pooling"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="encode",
+        ),
+        marks=pytest.mark.skipif(
+            current_platform.is_rocm(),
+            reason="Encoder self-attention is not implemented for ROCm",
+        ),
     ),
     # vision language model
     # See https://github.com/vllm-project/vllm/issues/26716.

View File

@@ -9,6 +9,7 @@ import pytest
 from tests.utils import wait_for_gpu_memory_to_clear
 from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM, SamplingParams
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
@@ -70,6 +71,10 @@ def llm_pair(request):
elif backend_config.specific_gpu_arch == (10, 0): elif backend_config.specific_gpu_arch == (10, 0):
pytest.skip("Only Blackwell GPUs support Cutlass MLA") pytest.skip("Only Blackwell GPUs support Cutlass MLA")
# FlashInfer is not supported on ROCm
if backend_config == AttentionBackendEnum.FLASHINFER and current_platform.is_rocm():
pytest.skip("FlashInfer is not supported on ROCm")
env_vars = { env_vars = {
# Force native sampler to avoid potential nondeterminism in FlashInfer # Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1. # when per-request generators are not used in V1.

View File

@@ -25,10 +25,13 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
     class Model(torch.nn.Module):
         def __init__(self) -> None:
             super().__init__()
-            self.pos_embed = torch.empty(buffer_size, hidden_size, dtype=dtype)
+            # Avoid using empty, since on rocm torch.empty
+            # does not initialize the memory.
+            self.pos_embed = torch.randn(buffer_size, hidden_size, dtype=dtype)

         def forward(self, x):
-            x += self.pos_embed[: x.shape[0]]
+            # Avoid += to prevent inplace addition.
+            x = x + self.pos_embed[: x.shape[0]]
             # Chain of reshapes
             y = x.reshape(-1, 128, 32)
             z = y.reshape(-1, 4096)