2026-02-11 13:38:11 +01:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
from tests.kernels.moe.utils import make_dummy_moe_config
|
|
|
|
|
from vllm.model_executor.layers.fused_moe.oracle.unquantized import (
|
|
|
|
|
UnquantizedMoeBackend,
|
|
|
|
|
select_unquantized_moe_backend,
|
|
|
|
|
)
|
2026-02-18 14:00:40 -06:00
|
|
|
from vllm.platforms import current_platform
|
2026-02-11 13:38:11 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "platform_method,expected_backend",
    [
        ("is_cuda", UnquantizedMoeBackend.TRITON),  # Default CUDA without FlashInfer
        ("is_rocm", UnquantizedMoeBackend.TRITON),  # ROCm without AITER
        ("is_cpu", UnquantizedMoeBackend.CPU),
        ("is_xpu", UnquantizedMoeBackend.XPU),
        ("is_tpu", UnquantizedMoeBackend.TPU),
        ("is_out_of_tree", UnquantizedMoeBackend.OOT),
    ],
)
@patch(
    # Patch the name where the oracle module looks it up, consistent with the
    # ROCm/AITER test below. NOTE(review): patching "vllm.utils.flashinfer.
    # has_flashinfer" would miss a reference already imported into the oracle
    # module — confirm against the oracle's import style.
    "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
    return_value=False,
)
@patch(
    "vllm.model_executor.layers.fused_moe.oracle.unquantized.rocm_aiter_ops.is_fused_moe_enabled",
    return_value=False,
)
def test_select_default_backend_by_platform(
    mock_aiter_enabled,
    mock_has_flashinfer,
    platform_method,
    expected_backend,
):
    """Test default backend selection per platform with all optional
    accelerators (FlashInfer, AITER) disabled.

    The stacked ``@patch`` decorators inject mocks bottom-up: the AITER mock
    (innermost decorator) arrives first, then the FlashInfer mock.
    """
    # Force exactly one platform predicate to report True; all others False.
    with (
        patch.object(current_platform, "is_cuda", return_value=False),
        patch.object(current_platform, "is_rocm", return_value=False),
        patch.object(current_platform, "is_cpu", return_value=False),
        patch.object(current_platform, "is_xpu", return_value=False),
        patch.object(current_platform, "is_tpu", return_value=False),
        patch.object(current_platform, "is_out_of_tree", return_value=False),
        patch.object(current_platform, platform_method, return_value=True),
    ):
        moe_config = make_dummy_moe_config()
        selected_backend, expert_cls = select_unquantized_moe_backend(
            moe_config=moe_config
        )

        assert selected_backend == expected_backend
        # These backends dispatch to platform-native paths and expose no
        # modular experts class; the remaining backends must provide one.
        if expected_backend in [
            UnquantizedMoeBackend.CPU,
            UnquantizedMoeBackend.OOT,
            UnquantizedMoeBackend.TPU,
        ]:
            assert expert_cls is None
        else:
            assert expert_cls is not None
|
2026-02-11 13:38:11 +01:00
|
|
|
|
|
|
|
|
|
2026-03-25 13:46:40 -05:00
|
|
|
@patch(
    "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
    return_value=False,
)
@patch(
    "vllm.model_executor.layers.fused_moe.oracle.unquantized.rocm_aiter_ops.is_fused_moe_enabled",
    return_value=True,
)
@pytest.mark.skipif(
    not current_platform.is_rocm(), reason="ROCm-specific backend selection test"
)
def test_select_rocm_aiter_backend(mock_aiter_enabled, mock_has_flashinfer):
    """Test ROCm backend selection when AITER is available."""
    with patch(
        "vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform"
    ) as platform_mock:
        # Report ROCm as the active platform; every other predicate is off.
        for predicate in ("is_cuda", "is_cpu", "is_xpu", "is_tpu", "is_out_of_tree"):
            getattr(platform_mock, predicate).return_value = False
        platform_mock.is_rocm.return_value = True

        backend, experts = select_unquantized_moe_backend(
            moe_config=make_dummy_moe_config(),
        )

        # With AITER enabled on ROCm the oracle must pick the AITER backend
        # and hand back a concrete experts implementation.
        assert backend == UnquantizedMoeBackend.AITER
        assert experts is not None
|
2026-03-25 13:46:40 -05:00
|
|
|
|
|
|
|
|
|
2026-02-11 13:38:11 +01:00
|
|
|
@patch(
    "vllm.model_executor.layers.fused_moe.experts.trtllm_bf16_moe.TrtLlmBf16Experts.is_supported_config",
    return_value=(True, None),
)
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="Only supported on NVIDIA platforms."
)
def test_select_cuda_flashinfer_trtllm_backend(mock_is_supported_trtllm, monkeypatch):
    """Test CUDA backend selection when FlashInfer TRTLLM is available and enabled."""
    # Opt in to the FlashInfer FP16/BF16 MoE path via the env toggle; the
    # monkeypatch fixture restores it after the test.
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")

    # Present CUDA hardware with a sufficient device capability; every other
    # platform predicate reports False.
    with (
        patch.object(current_platform, "is_cuda", return_value=True),
        patch.object(current_platform, "has_device_capability", return_value=True),
        patch.object(current_platform, "is_rocm", return_value=False),
        patch.object(current_platform, "is_cpu", return_value=False),
        patch.object(current_platform, "is_xpu", return_value=False),
        patch.object(current_platform, "is_tpu", return_value=False),
        patch.object(current_platform, "is_out_of_tree", return_value=False),
    ):
        cfg = make_dummy_moe_config()
        # TRTLLM requires EP and does not support DP
        cfg.moe_parallel_config.use_ep = True
        cfg.moe_parallel_config.use_dp = False

        backend, experts = select_unquantized_moe_backend(moe_config=cfg)

        assert backend == UnquantizedMoeBackend.FLASHINFER_TRTLLM
        assert experts is not None
|
2026-02-11 13:38:11 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@patch(
    "vllm.utils.flashinfer.has_flashinfer",
    return_value=True,
)
@patch(
    "vllm.model_executor.layers.fused_moe.experts.trtllm_bf16_moe.TrtLlmBf16Experts.is_supported_config",
    return_value=(False, None),
)
@patch(
    "vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts.is_supported_config",
    return_value=(True, None),
)
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="Only supported on NVIDIA platforms."
)
def test_select_cuda_flashinfer_cutlass_backend(
    # Stacked @patch decorators inject mocks bottom-up (innermost decorator
    # first), so the CUTLASS mock arrives first and has_flashinfer last.
    # The original parameter order named them in reverse.
    mock_is_supported_cutlass,
    mock_is_supported_trtllm,
    mock_has_flashinfer,
    monkeypatch,
):
    """Test CUDA backend selection when FlashInfer TRTLLM is not available
    and FlashInfer CUTLASS is available."""
    with (
        patch.object(current_platform, "is_cuda", return_value=True),
        patch.object(current_platform, "is_rocm", return_value=False),
        patch.object(current_platform, "is_cpu", return_value=False),
        patch.object(current_platform, "is_xpu", return_value=False),
        patch.object(current_platform, "is_tpu", return_value=False),
        patch.object(current_platform, "is_out_of_tree", return_value=False),
        patch.object(current_platform, "has_device_capability", return_value=True),
    ):
        # Enable FlashInfer via env var
        monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")

        moe_config = make_dummy_moe_config()
        # CUTLASS requires EP and does not support DP
        moe_config.moe_parallel_config.use_ep = True
        moe_config.moe_parallel_config.use_dp = False

        selected_backend, experts_cls = select_unquantized_moe_backend(
            moe_config=moe_config
        )

        # TRTLLM reports unsupported, so the oracle must fall through to the
        # CUTLASS experts implementation.
        assert selected_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS
        assert experts_cls is not None
|