diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 6045a4014..03d25a9b1 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -33,10 +33,10 @@ th { | Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass | | ------- | ------------------ | ------------ | ------------- | ----- | --------------------- | --------- | | naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] | -| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | -| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | -| flashinfer_nvlink_two_sided | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferNVLinkTwoSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_nvlink_two_sided_prepare_finalize.FlashInferNVLinkTwoSidedPrepareAndFinalize] | -| flashinfer_nvlink_one_sided | standard | nvfp4 | G,A,T | N | N | [`FlashInferNVLinkOneSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_nvlink_one_sided_prepare_finalize.FlashInferNVLinkOneSidedPrepareAndFinalize] | +| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht.DeepEPHTPrepareAndFinalize] | +| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll.DeepEPLLPrepareAndFinalize] | +| flashinfer_nvlink_two_sided | standard | nvfp4,fp8 | G,A,T | N | N | 
[`FlashInferNVLinkTwoSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided.FlashInferNVLinkTwoSidedPrepareAndFinalize] | +| flashinfer_nvlink_one_sided | standard | nvfp4 | G,A,T | N | N | [`FlashInferNVLinkOneSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_one_sided.FlashInferNVLinkOneSidedPrepareAndFinalize] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index 68cf07d7c..a56435379 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -199,10 +199,10 @@ register_experts( # Disable on blackwell for now if has_deep_ep() and not current_platform.has_device_capability(100): - from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import ( DeepEPHTPrepareAndFinalize, ) - from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import ( DeepEPLLPrepareAndFinalize, ) @@ -240,7 +240,7 @@ if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( FlashInferExperts, ) - from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_two_sided_prepare_finalize import ( # noqa: E501 + from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided import ( # noqa: E501 FlashInferNVLinkTwoSidedPrepareAndFinalize, ) @@ -271,7 +271,7 @@ if ( and has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100) ): - from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_one_sided_prepare_finalize import ( # noqa: E501 + from 
vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_one_sided import ( # noqa: E501 FlashInferNVLinkOneSidedPrepareAndFinalize, ) diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py index 525e3e67b..1663e5629 100644 --- a/tests/kernels/moe/parallel_utils.py +++ b/tests/kernels/moe/parallel_utils.py @@ -19,10 +19,10 @@ from vllm.utils.import_utils import has_deep_ep from vllm.utils.network_utils import get_open_port if has_deep_ep(): - from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import ( DeepEPHTPrepareAndFinalize, ) - from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import ( DeepEPLLPrepareAndFinalize, ) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index b9404975e..9dd8b13d6 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -37,10 +37,10 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch from .utils import make_dummy_moe_config, make_test_weights if has_deep_ep(): - from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import ( DeepEPHTPrepareAndFinalize, ) - from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import ( DeepEPLLPrepareAndFinalize, ) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 28bb83107..d04c3c99c 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -32,10 +32,10 @@ from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch if 
has_deep_ep(): - from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import ( DeepEPHTPrepareAndFinalize, ) - from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import ( DeepEPLLPrepareAndFinalize, ) diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py index 74f02d03c..44c9bb79e 100644 --- a/vllm/model_executor/layers/fused_moe/all2all_utils.py +++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py @@ -15,12 +15,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEParallelConfig, FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_one_sided_prepare_finalize import ( # noqa: E501 - FlashInferNVLinkOneSidedPrepareAndFinalize, -) -from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_two_sided_prepare_finalize import ( # noqa: E501 - FlashInferNVLinkTwoSidedPrepareAndFinalize, -) from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEPrepareAndFinalize, ) @@ -28,6 +22,12 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import ( make_moe_prepare_and_finalize_naive_dp_ep, make_moe_prepare_and_finalize_no_dp_ep, ) +from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_one_sided import ( # noqa: E501 + FlashInferNVLinkOneSidedPrepareAndFinalize, +) +from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided import ( # noqa: E501 + FlashInferNVLinkTwoSidedPrepareAndFinalize, +) from vllm.platforms import current_platform from vllm.utils.import_utils import has_deep_ep, has_mori, has_nixl_ep @@ -35,8 +35,8 @@ logger = init_logger(__name__) if current_platform.is_cuda_alike(): if has_deep_ep(): - from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize - from 
.deepep_ll_prepare_finalize import ( + from .prepare_finalize.deepep_ht import DeepEPHTPrepareAndFinalize + from .prepare_finalize.deepep_ll import ( DEEPEP_QUANT_BLOCK_SHAPE, DeepEPLLPrepareAndFinalize, ) diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/__init__.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/__init__.py index 03fea7c6d..d388ee411 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize/__init__.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/__init__.py @@ -19,4 +19,7 @@ __all__ = [ "MoEPrepareAndFinalizeNoDPEPMonolithic", "MoEPrepareAndFinalizeNoDPEPModular", "make_moe_prepare_and_finalize_no_dp_ep", + # deepep_ht, deepep_ll, and flashinfer_nvlink_{one,two}_sided are not + # imported here as they have optional dependencies (deep_ep, flashinfer). + # Import them directly from their modules as needed. ] diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ht.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py rename to vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ht.py diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py rename to vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_nvlink_one_sided_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/flashinfer_nvlink_one_sided_prepare_finalize.py rename to vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py diff --git
a/vllm/model_executor/layers/fused_moe/flashinfer_nvlink_two_sided_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/flashinfer_nvlink_two_sided_prepare_finalize.py rename to vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py