[MoE Refactor] Rename "naive" all2all backend (#36294)
Signed-off-by: Bill Nell <bnell@redhat.com>
This commit is contained in:
@@ -103,7 +103,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
|
||||
|
||||
## Modular Kernel "families"
|
||||
|
||||
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
|
||||
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts.
|
||||
|
||||
| backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
|
||||
| ------- | ---------------------------------------------- | ----------------------------------- |
|
||||
|
||||
@@ -23,7 +23,6 @@ vLLM provides multiple communication backends for EP. Use `--all2all-backend` to
|
||||
| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
|
||||
| `flashinfer_nvlink_one_sided` | MNNVL systems | FlashInfer's one-sided A2A strategy for multi-node NVLink | High-throughput workloads |
|
||||
| `flashinfer_nvlink_two_sided` | MNNVL systems | FlashInfer's two-sided A2A strategy for multi-node NVLink | Systems with NVLink across nodes |
|
||||
| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production |
|
||||
|
||||
## Single Node Deployment
|
||||
|
||||
|
||||
@@ -162,7 +162,6 @@ class ParallelConfig:
|
||||
all2all_backend: All2AllBackend = "allgather_reducescatter"
|
||||
"""All2All backend for MoE expert parallel communication. Available options:
|
||||
|
||||
- "naive": Naive all2all implementation using broadcasts\n
|
||||
- "allgather_reducescatter": All2all based on allgather and reducescatter\n
|
||||
- "deepep_high_throughput": Use deepep high-throughput kernels\n
|
||||
- "deepep_low_latency": Use deepep low-latency kernels\n
|
||||
@@ -344,10 +343,11 @@ class ParallelConfig:
|
||||
f"but found: {self._api_process_rank}"
|
||||
)
|
||||
|
||||
if self.all2all_backend == "pplx":
|
||||
if self.all2all_backend in ["pplx", "naive"]:
|
||||
logger.warning(
|
||||
"The 'pplx' all2all backend has been removed. "
|
||||
"Falling back to 'allgather_reducescatter'."
|
||||
"The '%s' all2all backend has been removed. "
|
||||
"Falling back to 'allgather_reducescatter'.",
|
||||
self.all2all_backend,
|
||||
)
|
||||
self.all2all_backend = "allgather_reducescatter"
|
||||
|
||||
@@ -534,7 +534,6 @@ class ParallelConfig:
|
||||
self.all2all_backend
|
||||
in (
|
||||
"allgather_reducescatter",
|
||||
"naive",
|
||||
"deepep_high_throughput",
|
||||
"deepep_low_latency",
|
||||
"mori",
|
||||
@@ -764,7 +763,7 @@ class ParallelConfig:
|
||||
)
|
||||
|
||||
if (
|
||||
self.all2all_backend in ("allgather_reducescatter", "naive")
|
||||
self.all2all_backend in ("allgather_reducescatter")
|
||||
and self.eplb_config.use_async
|
||||
):
|
||||
logger.warning(
|
||||
|
||||
@@ -229,7 +229,7 @@ def maybe_make_prepare_finalize(
|
||||
num_dispatchers=all2all_manager.world_size,
|
||||
)
|
||||
|
||||
elif moe.use_naive_all2all_kernels and allow_new_interface:
|
||||
elif moe.use_ag_rs_all2all_kernels and allow_new_interface:
|
||||
prepare_finalize = make_moe_prepare_and_finalize_naive_dp_ep(
|
||||
use_monolithic=use_monolithic,
|
||||
is_sequence_parallel=moe.moe_parallel_config.is_sequence_parallel,
|
||||
|
||||
@@ -975,9 +975,10 @@ class FusedMoEParallelConfig:
|
||||
return self.use_deepep_ll_kernels
|
||||
|
||||
@property
|
||||
def use_naive_all2all_kernels(self):
|
||||
return self.use_all2all_kernels and (
|
||||
self.all2all_backend in ["naive", "allgather_reducescatter"]
|
||||
def use_ag_rs_all2all_kernels(self):
|
||||
return (
|
||||
self.use_all2all_kernels
|
||||
and self.all2all_backend == "allgather_reducescatter"
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -1143,7 +1144,7 @@ class FusedMoEParallelConfig:
|
||||
ep_rank=0,
|
||||
sp_size=1,
|
||||
use_ep=False,
|
||||
all2all_backend="naive",
|
||||
all2all_backend="allgather_reducescatter",
|
||||
enable_eplb=False,
|
||||
)
|
||||
|
||||
@@ -1256,8 +1257,8 @@ class FusedMoEConfig:
|
||||
return self.moe_parallel_config.use_fi_nvl_one_sided_kernels
|
||||
|
||||
@property
|
||||
def use_naive_all2all_kernels(self):
|
||||
return self.moe_parallel_config.use_naive_all2all_kernels
|
||||
def use_ag_rs_all2all_kernels(self):
|
||||
return self.moe_parallel_config.use_ag_rs_all2all_kernels
|
||||
|
||||
@property
|
||||
def use_nixl_ep_kernels(self):
|
||||
|
||||
@@ -79,7 +79,7 @@ class TrtLlmFp8ExpertsBase:
|
||||
"""Monolithic kernel so only use with naive DP/EP and TP."""
|
||||
return (
|
||||
not moe_parallel_config.use_all2all_kernels
|
||||
or moe_parallel_config.use_naive_all2all_kernels
|
||||
or moe_parallel_config.use_ag_rs_all2all_kernels
|
||||
) and not moe_parallel_config.enable_eplb
|
||||
|
||||
def supports_chunking(self) -> bool:
|
||||
|
||||
Reference in New Issue
Block a user