vllm/tests/kernels/moe/test_triton_moe_no_act_mul.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test for is_act_and_mul=False MoE using Triton.
This tests the code path used by models like Nemotron-H that use
non-fused activations (e.g., relu2_no_mul) instead of SwiGLU-style
fused activations.
This feature is supported on both CUDA and ROCm (with AITER disabled).
"""

import pytest
import torch

from vllm.platforms import current_platform

pytestmark = pytest.mark.skipif(
    not current_platform.is_cuda_alike(),
    reason="Tests for is_act_and_mul=False MoE require CUDA or ROCm",
)


@pytest.fixture
def disable_aiter_on_rocm(monkeypatch):
    """Disable AITER on ROCm so that the Triton path is used."""
    if current_platform.is_rocm():
        from vllm._aiter_ops import rocm_aiter_ops

        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "0")
        monkeypatch.setenv("VLLM_ROCM_USE_AITER_MOE", "0")
        rocm_aiter_ops.refresh_env_variables()
        yield
        # Re-sync the cached AITER flags on teardown.
        rocm_aiter_ops.refresh_env_variables()
    else:
        # On CUDA, no special setup is needed.
        yield


@pytest.fixture
def init_workspace():
    """Initialize workspace manager for MoE tests."""
    from vllm.v1.worker.workspace import (
        init_workspace_manager,
        reset_workspace_manager,
    )

    torch.manual_seed(42)
    init_workspace_manager(torch.cuda.current_device())
    yield
    reset_workspace_manager()


@pytest.mark.parametrize("m", [1, 33, 64, 222])
@pytest.mark.parametrize("n", [128, 256, 1024])
@pytest.mark.parametrize("k", [128, 512])
@pytest.mark.parametrize("e", [4, 8])
@pytest.mark.parametrize("topk", [2])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize(
    "activation", ["relu2_no_mul", "silu_no_mul", "gelu_no_mul"]
)
@torch.inference_mode()
def test_moe_no_act_mul(
    disable_aiter_on_rocm,
    init_workspace,
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    dtype: torch.dtype,
    activation: str,
):
"""Test MoE with is_act_and_mul=False using Triton."""
from vllm.model_executor.layers.fused_moe import TritonExperts, fused_topk
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
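
    # With is_act_and_mul=False there is no gate/up split, so w1 is
    # (e, n, k) rather than the (e, 2n, k) layout used by fused activations.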
    a = torch.randn((m, k), device="cuda", dtype=dtype)
    w1 = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

    quant_config = FusedMoEQuantConfig.make(is_act_and_mul=False)

    score = torch.randn((m, e), device="cuda", dtype=dtype)
    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=True)
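
    # FusedMoEModularKernel composes a prepare/finalize stage with an experts
    # implementation; MoEPrepareAndFinalizeNoEP is the single-rank
    # (no expert-parallel) variant.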
    fused_experts = FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        TritonExperts(quant_config),
    )

    output = fused_experts(
        hidden_states=a,
        w1=w1,
        w2=w2,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        activation=activation,
    )

    assert output.shape == (m, k), f"Expected shape {(m, k)}, got {output.shape}"
    assert not torch.isnan(output).any(), "Output contains NaN"
    assert not torch.isinf(output).any(), "Output contains Inf"
    assert output.abs().sum() > 0, "Output is all zeros"


@torch.inference_mode()
def test_moe_workspace_shapes_no_act_mul(disable_aiter_on_rocm):
    """Test workspace_shapes returns correct sizes for is_act_and_mul=False."""
    from vllm.model_executor.layers.fused_moe import TritonExperts
    from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig

    M, N, K, topk = 64, 256, 128, 2
    quant_config = FusedMoEQuantConfig.make(is_act_and_mul=False)
    experts = TritonExperts(quant_config)

    ws1, ws2, out = experts.workspace_shapes(M, N, K, topk, 8, 8, None)
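    # Without the act-and-mul split, the first GEMM here produces N columns
    # (not 2N), so the widest intermediate the workspace must hold is
    # max(N, K).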
    assert ws1[2] == max(N, K)
    assert out == (M, K)