From a776a48b1c753645c547b735ab647867c98a9b0c Mon Sep 17 00:00:00 2001
From: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Date: Wed, 8 Apr 2026 12:23:08 -0700
Subject: [PATCH] [MoE] Move DEEP_GEMM into experts/ subdirectory (#39005)
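
This patch only moves files and updates import paths; kernel logic is
unchanged (both modules are renamed with 100% similarity). Out-of-tree code
that imported the moved modules directly needs the same mechanical update,
e.g. (illustrative sketch; such callers are not part of this diff):

    # before this patch
    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
    # after this patch
    from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts

The package-level imports in vllm/model_executor/layers/fused_moe/__init__.py
are updated below, so imports of the form
`from vllm.model_executor.layers.fused_moe import DeepGemmExperts` (available
when HAS_TRITON) keep working unchanged.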

Signed-off-by: Jackmin801
Signed-off-by: Robert Shaw
Co-authored-by: Robert Shaw
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
---
 benchmarks/kernels/benchmark_silu_mul_fp8_quant.py     |  2 +-
 docs/design/moe_kernel_features.md                     |  2 +-
 tests/kernels/moe/modular_kernel_tools/mk_objects.py   |  8 ++++----
 tests/kernels/moe/test_batched_deepgemm.py             |  4 ++--
 tests/kernels/moe/test_block_fp8.py                    |  2 +-
 tests/kernels/moe/test_deepep_deepgemm_moe.py          |  6 ++++--
 tests/kernels/moe/test_deepgemm.py                     |  2 +-
 tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py |  2 +-
 vllm/model_executor/layers/fused_moe/__init__.py       | 10 ++++++----
 .../fused_moe/{ => experts}/batched_deep_gemm_moe.py   |  0
 .../layers/fused_moe/{ => experts}/deep_gemm_moe.py    |  0
 vllm/model_executor/layers/fused_moe/oracle/fp8.py     |  2 +-
 .../layers/fused_moe/triton_deep_gemm_moe.py           |  2 +-
 vllm/model_executor/warmup/deep_gemm_warmup.py         |  2 +-
 14 files changed, 24 insertions(+), 20 deletions(-)
 rename vllm/model_executor/layers/fused_moe/{ => experts}/batched_deep_gemm_moe.py (100%)
 rename vllm/model_executor/layers/fused_moe/{ => experts}/deep_gemm_moe.py (100%)

diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index 13b97b769..9fcf278f2 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -20,7 +20,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import torch
 
-from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
     persistent_masked_m_silu_mul_quant,
 )
 from vllm.triton_utils import tl, triton
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 7cf7b76d6..c9dc1292d 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -82,7 +82,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
 | ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
 | triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,<br>swigluoai,<br>silu_no_mul,<br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],<br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
 | triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
-| deep gemm | standard,<br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],<br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
+| deep gemm | standard,<br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],<br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
 | cutlass_fp4 | standard,<br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
 | cutlass_fp8 | standard,<br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],<br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
 | flashinfer | standard | nvfp4,<br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
index a56435379..a39e03abe 100644
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -7,14 +7,14 @@ import torch
 # Fused experts and PrepareFinalize imports
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe import TritonExperts
-from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    BatchedDeepGemmExperts,
-)
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
+from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
+    BatchedDeepGemmExperts,
+)
+from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedTritonExperts,
     NaiveBatchedExperts,
diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py
index 20763b91d..b11098c82 100644
--- a/tests/kernels/moe/test_batched_deepgemm.py
+++ b/tests/kernels/moe/test_batched_deepgemm.py
@@ -5,10 +5,10 @@ import pytest
 import torch
 
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
-from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
+from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
     BatchedDeepGemmExperts,
 )
-from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedPrepareAndFinalize,
     BatchedTritonExperts,
diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py
index f27fd6f34..0181fc252 100644
--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -28,7 +28,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
 from vllm.model_executor.layers.fused_moe.config import (
     fp8_w8a8_moe_quant_config,
 )
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
     _valid_deep_gemm_shape,
 )
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index 9dd8b13d6..6bde13e0e 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -47,10 +47,12 @@ if has_deep_ep():
     from .parallel_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
 
 if has_deep_gemm():
-    from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+    from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
         BatchedDeepGemmExperts,
     )
-    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
+    from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
+        DeepGemmExperts,
+    )
 
 requires_deep_ep = pytest.mark.skipif(
     not has_deep_ep(),
diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py
index c2949391c..47700f82a 100644
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -175,7 +175,7 @@ def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch, workspace_i
     mp.setenv("VLLM_USE_DEEP_GEMM", "1")
 
     _DeepGemmExperts = importlib.import_module(
-        "vllm.model_executor.layers.fused_moe.deep_gemm_moe"
+        "vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe"
     ).DeepGemmExperts
 
     call_counter = {"cnt": 0}
diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
index 4a447ba7c..ed58db62d 100644
--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
@@ -7,7 +7,7 @@ import random
 import pytest
 import torch
 
-from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
     persistent_masked_m_silu_mul_quant,
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index f56a2e63b..b342e0c6e 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -77,16 +77,18 @@ __all__ = [
 
 if HAS_TRITON:
     # import to register the custom ops
-    from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-        BatchedDeepGemmExperts,
-    )
     from vllm.model_executor.layers.fused_moe.cutlass_moe import (
         CutlassBatchedExpertsFp8,
         CutlassExpertsFp8,
         CutlassExpertsW4A8Fp8,
         cutlass_moe_w4a8_fp8,
     )
-    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
+    from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
+        BatchedDeepGemmExperts,
+    )
+    from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
+        DeepGemmExperts,
+    )
     from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
         BatchedTritonExperts,
     )
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/experts/deep_gemm_moe.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/deep_gemm_moe.py
diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
index 36f35ed5e..d3c70d2d0 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -131,7 +131,7 @@ def backend_to_kernel_cls(
 
         return [TritonOrDeepGemmExperts]
     elif backend == Fp8MoeBackend.BATCHED_DEEPGEMM:
-        from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
             BatchedDeepGemmExperts,
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index b601806b0..d58dbc20e 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -9,7 +9,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
     DeepGemmExperts,
     _valid_deep_gemm,
     _valid_deep_gemm_shape,
diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py
index 07476906e..e41b2c5e1 100644
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -11,8 +11,8 @@ from tqdm import tqdm
 
 import vllm.envs as envs
 from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
+from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
     TritonOrDeepGemmExperts,