diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index ea8956e20..3d2e02e9d 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -103,7 +103,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
 
 ## Modular Kernel "families"
 
-The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
+The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts.
 
 | backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
 | ------- | ---------------------------------------------- | ----------------------------------- |
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
index 3b13872a2..d75ae7feb 100644
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -23,7 +23,6 @@ vLLM provides multiple communication backends for EP. Use `--all2all-backend` to
 | `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
 | `flashinfer_nvlink_one_sided` | MNNVL systems | FlashInfer's one-sided A2A strategy for multi-node NVLink | High-throughput workloads |
 | `flashinfer_nvlink_two_sided` | MNNVL systems | FlashInfer's two-sided A2A strategy for multi-node NVLink | Systems with NVLink across nodes |
-| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production |
 
 ## Single Node Deployment
 
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index add011ca4..dd0d7b9cc 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -162,7 +162,6 @@ class ParallelConfig:
 
     all2all_backend: All2AllBackend = "allgather_reducescatter"
     """All2All backend for MoE expert parallel communication. Available options:
-    - "naive": Naive all2all implementation using broadcasts\n
     - "allgather_reducescatter": All2all based on allgather and reducescatter\n
     - "deepep_high_throughput": Use deepep high-throughput kernels\n
     - "deepep_low_latency": Use deepep low-latency kernels\n
@@ -344,10 +343,11 @@ class ParallelConfig:
                 f"but found: {self._api_process_rank}"
             )
 
-        if self.all2all_backend == "pplx":
+        if self.all2all_backend in ["pplx", "naive"]:
             logger.warning(
-                "The 'pplx' all2all backend has been removed. "
-                "Falling back to 'allgather_reducescatter'."
+                "The '%s' all2all backend has been removed. "
+                "Falling back to 'allgather_reducescatter'.",
+                self.all2all_backend,
             )
             self.all2all_backend = "allgather_reducescatter"
 
@@ -534,7 +534,6 @@ class ParallelConfig:
             self.all2all_backend
             in (
                 "allgather_reducescatter",
-                "naive",
                 "deepep_high_throughput",
                 "deepep_low_latency",
                 "mori",
@@ -764,7 +763,7 @@ class ParallelConfig:
             )
 
         if (
-            self.all2all_backend in ("allgather_reducescatter", "naive")
+            self.all2all_backend == "allgather_reducescatter"
             and self.eplb_config.use_async
         ):
             logger.warning(
diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py
index 4498a8a93..74f02d03c 100644
--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -229,7 +229,7 @@ def maybe_make_prepare_finalize(
             num_dispatchers=all2all_manager.world_size,
         )
 
-    elif moe.use_naive_all2all_kernels and allow_new_interface:
+    elif moe.use_ag_rs_all2all_kernels and allow_new_interface:
         prepare_finalize = make_moe_prepare_and_finalize_naive_dp_ep(
             use_monolithic=use_monolithic,
             is_sequence_parallel=moe.moe_parallel_config.is_sequence_parallel,
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 2500387de..2eb0f4921 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -975,9 +975,10 @@ class FusedMoEParallelConfig:
         return self.use_deepep_ll_kernels
 
     @property
-    def use_naive_all2all_kernels(self):
-        return self.use_all2all_kernels and (
-            self.all2all_backend in ["naive", "allgather_reducescatter"]
+    def use_ag_rs_all2all_kernels(self):
+        return (
+            self.use_all2all_kernels
+            and self.all2all_backend == "allgather_reducescatter"
         )
 
     @property
@@ -1143,7 +1144,7 @@ class FusedMoEParallelConfig:
             ep_rank=0,
             sp_size=1,
             use_ep=False,
-            all2all_backend="naive",
+            all2all_backend="allgather_reducescatter",
             enable_eplb=False,
         )
 
@@ -1256,8 +1257,8 @@ class FusedMoEConfig:
         return self.moe_parallel_config.use_fi_nvl_one_sided_kernels
 
     @property
-    def use_naive_all2all_kernels(self):
-        return self.moe_parallel_config.use_naive_all2all_kernels
+    def use_ag_rs_all2all_kernels(self):
+        return self.moe_parallel_config.use_ag_rs_all2all_kernels
 
     @property
     def use_nixl_ep_kernels(self):
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
index 5f4607657..501c10ab0 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
@@ -79,7 +79,7 @@ class TrtLlmFp8ExpertsBase:
         """Monolithic kernel so only use with naive DP/EP and TP."""
         return (
             not moe_parallel_config.use_all2all_kernels
-            or moe_parallel_config.use_naive_all2all_kernels
+            or moe_parallel_config.use_ag_rs_all2all_kernels
         ) and not moe_parallel_config.enable_eplb
 
     def supports_chunking(self) -> bool: