[MoE Refactor][16/N] Apply Refactor to NVFP4 (#31692)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
Author:  Robert Shaw
Date:    2026-01-07 22:46:27 -05:00 (committed by GitHub)
Parent:  8dd2419fa9
Commit:  9f6dcb71ae
15 changed files with 777 additions and 681 deletions

@@ -3,6 +3,7 @@
 import pytest
 import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_test_weights
 from tests.kernels.quantization.nvfp4_utils import (
     FLOAT4_E2M1_MAX,
@@ -13,8 +14,13 @@ from tests.kernels.utils import torch_moe
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
+from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+    CutlassExpertsFp4,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed
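
Note: the `fused_topk` import is untouched by this refactor; it still produces the `topk_weights`/`topk_ids` routing tensors that the modular kernel consumes in the hunk below. A minimal sketch of that routing step follows, with the router-logit tensor `score` and the placeholder sizes (`m`, `e`, `topk`, `dtype`) assumed from the test setup; since the exact return arity of `fused_topk` varies across vLLM versions, the result is indexed rather than unpacked.

# Routing sketch (assumptions labelled above; not part of this diff).
score = torch.randn((m, e), device="cuda", dtype=dtype)  # per-token router logits
routing = fused_topk(a, score, topk, renormalize=False)
topk_weights, topk_ids = routing[0], routing[1]          # weights and expert ids fed to the kernel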
@@ -83,17 +89,21 @@ def test_cutlass_fp4_moe_no_graph(
         w2_scale=w2_blockscale,
     )
-    cutlass_output = cutlass_moe_fp4(
-        a=a,
-        w1_fp4=w1_q,
-        w2_fp4=w2_q,
+    kernel = mk.FusedMoEModularKernel(
+        MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+        CutlassExpertsFp4(
+            out_dtype=dtype,
+            max_experts_per_worker=e,
+            quant_config=quant_config,
+        ),
+    )
+    cutlass_output = kernel(
+        hidden_states=a,
+        w1=w1_q,
+        w2=w2_q,
         topk_weights=topk_weights,
         topk_ids=topk_ids,
-        quant_config=quant_config,
-        m=m,
-        n=n,
-        k=k,
-        e=e,
     )
     # Reference check:
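
For orientation, here is the new call pattern assembled in one piece from the fragments above; this is a sketch, not the full test. `a`, `w1_q`, `w2_q`, `dtype`, `e`, `quant_config`, `topk_weights`, and `topk_ids` all come from the surrounding test setup (quantized weights, block scales, and an NVFP4 quant config built via `nvfp4_moe_quant_config`), and the comments describe assumptions rather than documented behaviour.

# Sketch of the modular-kernel pattern that replaces the old cutlass_moe_fp4() entry point:
# the prepare/finalize stage and the fused-experts implementation are now separate components.
kernel = mk.FusedMoEModularKernel(
    MoEPrepareAndFinalizeNoEP(defer_input_quant=True),  # no expert parallelism in this test
    CutlassExpertsFp4(
        out_dtype=dtype,             # output dtype of the MoE layer
        max_experts_per_worker=e,    # all experts live on this single worker
        quant_config=quant_config,   # NVFP4 weight/activation scales
    ),
)
cutlass_output = kernel(
    hidden_states=a,                 # input activations
    w1=w1_q,                         # NVFP4-quantized expert weights (first projection)
    w2=w2_q,                         # NVFP4-quantized expert weights (second projection)
    topk_weights=topk_weights,       # router outputs from fused_topk
    topk_ids=topk_ids,
)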