[WideEP] Remove pplx all2all backend (#33724)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
0f2f24c8b2
commit
eb19955c37
@@ -155,5 +155,14 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
@@ -725,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# CUTLASS MoE kernels
|
||||
|
||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
|
||||
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
|
||||
# on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled
|
||||
# if it's possible to compile MoE kernels that use its output.
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||
|
||||
14
csrc/ops.h
14
csrc/ops.h
@@ -269,13 +269,13 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
const int64_t n, const int64_t k, const bool swap_ab);
|
||||
|
||||
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
const torch::Tensor& expert_num_tokens,
|
||||
const int64_t num_local_experts,
|
||||
const int64_t padded_m, const int64_t n,
|
||||
const int64_t k);
|
||||
void get_cutlass_batched_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
const torch::Tensor& expert_num_tokens,
|
||||
const int64_t num_local_experts,
|
||||
const int64_t padded_m, const int64_t n,
|
||||
const int64_t k);
|
||||
|
||||
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
|
||||
@@ -263,12 +263,10 @@ void get_cutlass_moe_mm_data_caller(
|
||||
}
|
||||
|
||||
template <bool SWAP_AB>
|
||||
__global__ void compute_pplx_data(int32_t* expert_offsets,
|
||||
int32_t* problem_sizes1,
|
||||
int32_t* problem_sizes2,
|
||||
const int32_t* __restrict__ expert_num_tokens,
|
||||
const int padded_m, const int n,
|
||||
const int k) {
|
||||
__global__ void compute_batched_moe_data(
|
||||
int32_t* expert_offsets, int32_t* problem_sizes1, int32_t* problem_sizes2,
|
||||
const int32_t* __restrict__ expert_num_tokens, const int padded_m,
|
||||
const int n, const int k) {
|
||||
int expert_idx = threadIdx.x;
|
||||
expert_offsets[expert_idx] = expert_idx * padded_m;
|
||||
|
||||
@@ -289,24 +287,22 @@ __global__ void compute_pplx_data(int32_t* expert_offsets,
|
||||
}
|
||||
}
|
||||
|
||||
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
const torch::Tensor& expert_num_tokens,
|
||||
const int64_t num_local_experts,
|
||||
const int64_t padded_m,
|
||||
const int64_t n, const int64_t k) {
|
||||
void get_cutlass_batched_moe_mm_data_caller(
|
||||
torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const torch::Tensor& expert_num_tokens,
|
||||
const int64_t num_local_experts, const int64_t padded_m, const int64_t n,
|
||||
const int64_t k) {
|
||||
auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
|
||||
|
||||
if (num_local_experts * padded_m > SWAP_AB_THRESHOLD) {
|
||||
compute_pplx_data<false><<<1, num_local_experts, 0, stream>>>(
|
||||
compute_batched_moe_data<false><<<1, num_local_experts, 0, stream>>>(
|
||||
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
||||
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
||||
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
||||
static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
|
||||
k);
|
||||
} else {
|
||||
compute_pplx_data<true><<<1, num_local_experts, 0, stream>>>(
|
||||
compute_batched_moe_data<true><<<1, num_local_experts, 0, stream>>>(
|
||||
static_cast<int32_t*>(expert_offsets.data_ptr()),
|
||||
static_cast<int32_t*>(problem_sizes1.data_ptr()),
|
||||
static_cast<int32_t*>(problem_sizes2.data_ptr()),
|
||||
|
||||
@@ -82,13 +82,11 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
const int64_t n, const int64_t k, const bool swap_ab);
|
||||
|
||||
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
const torch::Tensor& expert_num_tokens,
|
||||
const int64_t num_local_experts,
|
||||
const int64_t padded_m,
|
||||
const int64_t n, const int64_t k);
|
||||
void get_cutlass_batched_moe_mm_data_caller(
|
||||
torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const torch::Tensor& expert_num_tokens,
|
||||
const int64_t num_local_experts, const int64_t padded_m, const int64_t n,
|
||||
const int64_t k);
|
||||
#endif
|
||||
|
||||
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||
@@ -319,29 +317,30 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
const torch::Tensor& expert_num_tokens,
|
||||
const int64_t num_local_experts,
|
||||
const int64_t padded_m, const int64_t n,
|
||||
const int64_t k) {
|
||||
void get_cutlass_batched_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
const torch::Tensor& expert_num_tokens,
|
||||
const int64_t num_local_experts,
|
||||
const int64_t padded_m, const int64_t n,
|
||||
const int64_t k) {
|
||||
// This function currently gets compiled only if we have a valid cutlass moe
|
||||
// mm to run it for.
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
|
||||
problem_sizes2, expert_num_tokens,
|
||||
num_local_experts, padded_m, n, k);
|
||||
get_cutlass_batched_moe_mm_data_caller(expert_offsets, problem_sizes1,
|
||||
problem_sizes2, expert_num_tokens,
|
||||
num_local_experts, padded_m, n, k);
|
||||
return;
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
|
||||
"for CUDA device capability: ",
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false,
|
||||
"No compiled get_cutlass_batched_moe_mm_data: no "
|
||||
"cutlass_scaled_mm kernel "
|
||||
"for CUDA device capability: ",
|
||||
version_num,
|
||||
". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
||||
|
||||
@@ -489,19 +489,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
&get_cutlass_moe_mm_problem_sizes_from_expert_offsets);
|
||||
|
||||
// A function that computes data required to run fused MoE with w8a8 grouped
|
||||
// GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
|
||||
// GEMM in batched expert format. It takes expert_num_tokens
|
||||
// as an input, and computes expert_offsets (token start indices of each
|
||||
// expert). In addition to this, it computes problem sizes for each expert's
|
||||
// multiplication used by the two mms called from fused MoE operation.
|
||||
ops.def(
|
||||
"get_cutlass_pplx_moe_mm_data(Tensor! expert_offsets, "
|
||||
"get_cutlass_batched_moe_mm_data(Tensor! expert_offsets, "
|
||||
" Tensor! problem_sizes1, "
|
||||
" Tensor! problem_sizes2, "
|
||||
" Tensor expert_num_tokens, "
|
||||
" int num_local_experts, int padded_m, "
|
||||
" int n, int k) -> ()");
|
||||
ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA,
|
||||
&get_cutlass_pplx_moe_mm_data);
|
||||
ops.impl("get_cutlass_batched_moe_mm_data", torch::kCUDA,
|
||||
&get_cutlass_batched_moe_mm_data);
|
||||
|
||||
// Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
|
||||
ops.def(
|
||||
|
||||
@@ -308,7 +308,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
|
||||
#################### EXTENSIONS BUILD IMAGE ####################
|
||||
# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
|
||||
# Build DeepGEMM, DeepEP - runs in PARALLEL with csrc-build
|
||||
# This stage is independent and doesn't affect csrc cache
|
||||
FROM base AS extensions-build
|
||||
ARG CUDA_VERSION
|
||||
@@ -335,10 +335,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
|
||||
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
|
||||
|
||||
# Build pplx-kernels and DeepEP wheels
|
||||
# Build DeepEP wheels
|
||||
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
|
||||
# Defaults moved here from tools/ep_kernels/install_python_libraries.sh for centralized version management
|
||||
ARG PPLX_COMMIT_HASH=12cecfd
|
||||
ARG DEEPEP_COMMIT_HASH=73b6ea4
|
||||
ARG NVSHMEM_VER
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@@ -347,7 +346,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
/tmp/install_python_libraries.sh \
|
||||
--workspace /tmp/ep_kernels_workspace \
|
||||
--mode wheel \
|
||||
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
|
||||
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \
|
||||
${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \
|
||||
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
|
||||
@@ -676,7 +674,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
|
||||
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
|
||||
|
||||
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
|
||||
# Install EP kernels wheels (DeepEP) that have been built in the `build` stage
|
||||
RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system ep_kernels/dist/*.whl --verbose \
|
||||
|
||||
@@ -52,9 +52,6 @@
|
||||
"DEEPGEMM_GIT_REF": {
|
||||
"default": "477618cd51baffca09c4b0b87e97c03fe827ef03"
|
||||
},
|
||||
"PPLX_COMMIT_HASH": {
|
||||
"default": "12cecfd"
|
||||
},
|
||||
"DEEPEP_COMMIT_HASH": {
|
||||
"default": "73b6ea4"
|
||||
},
|
||||
|
||||
@@ -15,7 +15,7 @@ Based on the format of the input activations, FusedMoE implementations are broad
|
||||
The input activation format completely depends on the All2All Dispatch being used.
|
||||
|
||||
* In the Contiguous variant, the All2All Dispatch returns the activations as a contiguous tensor of shape (M, K) along with TopK Ids and TopK weights of shape (M, num_topk). Look at `DeepEPHTPrepareAndFinalize` for an example.
|
||||
* In the Batched variant, the All2All Dispatch returns the activations as a tensor of shape (num_experts, max_tokens, K). Here, the activations/tokens that subscribe to the same expert are batched together. Note that not all entries of the tensor are valid. The activations tensor is typically accompanied by an `expert_num_tokens` tensor of size `num_experts`, where `expert_num_tokens[i]` indicates the number of valid tokens that subscribe to the ith expert. Look at `PplxPrepareAndFinalize` or `DeepEPLLPrepareAndFinalize` for an example.
|
||||
* In the Batched variant, the All2All Dispatch returns the activations as a tensor of shape (num_experts, max_tokens, K). Here, the activations/tokens that subscribe to the same expert are batched together. Note that not all entries of the tensor are valid. The activations tensor is typically accompanied by an `expert_num_tokens` tensor of size `num_experts`, where `expert_num_tokens[i]` indicates the number of valid tokens that subscribe to the ith expert. Look at `DeepEPLLPrepareAndFinalize` for an example.
|
||||
|
||||
The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below
|
||||
|
||||
@@ -132,7 +132,6 @@ class FusedMoEModularKernel:
|
||||
|
||||
Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example,
|
||||
|
||||
* PplxPrepareAndFinalize type is backed by Pplx All2All kernels,
|
||||
* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughput All2All kernels, and
|
||||
* DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels.
|
||||
|
||||
@@ -229,7 +228,7 @@ Doing this will add the new implementation to the test suite.
|
||||
### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
|
||||
|
||||
The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
|
||||
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
|
||||
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts`
|
||||
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
|
||||
with incompatible types, the script will error.
|
||||
|
||||
@@ -238,7 +237,7 @@ with incompatible types, the script will error.
|
||||
Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
|
||||
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
|
||||
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
|
||||
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
|
||||
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts`
|
||||
|
||||
## FusedMoEPrepareAndFinalize Implementations
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ th {
|
||||
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|
||||
|---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
|
||||
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
|
||||
| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
|
||||
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
|
||||
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
|
||||
| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferA2APrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize.FlashInferA2APrepareAndFinalize] |
|
||||
@@ -68,7 +67,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
|
||||
|
||||
There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters, so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
|
||||
|
||||
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx` and `DeepEPLLPrepareAndFinalize`.
|
||||
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `DeepEPLLPrepareAndFinalize`.
|
||||
|
||||
Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type.
|
||||
|
||||
@@ -110,5 +109,5 @@ The following table shows "families" of modular kernels that are intended to wor
|
||||
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
||||
|---------|-----------------------------------------|----------------------------------------------|
|
||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
|
||||
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
|
||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||
|
||||
@@ -154,7 +154,7 @@ If you have PRs touching the area, please feel free to ping the area owner for r
|
||||
- FlashAttention: @LucasWilkinson
|
||||
- FlashInfer: @LucasWilkinson, @mgoin, @WoosukKwon
|
||||
- Blackwell Kernels: @mgoin, @yewentao256
|
||||
- DeepEP/DeepGEMM/pplx: @mgoin, @yewentao256
|
||||
- DeepEP/DeepGEMM: @mgoin, @yewentao256
|
||||
|
||||
### Integrations
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ EP is typically coupled with Data Parallelism (DP). While DP can be used indepen
|
||||
|
||||
Before using EP, you need to install the necessary dependencies. We are actively working on making this easier in the future:
|
||||
|
||||
1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
|
||||
1. **Install DeepEP**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
|
||||
2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation).
|
||||
3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](../../tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
|
||||
|
||||
@@ -19,7 +19,6 @@ vLLM provides multiple communication backends for EP. Use `--all2all-backend` to
|
||||
| Backend | Use Case | Features | Best For |
|
||||
|---------|----------|----------|----------|
|
||||
| `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration |
|
||||
| `pplx` | Single node | Chunked prefill support, efficient intra-node communication | Single-node deployments, development |
|
||||
| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios |
|
||||
| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
|
||||
| `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes |
|
||||
@@ -71,12 +70,11 @@ For example, with `TP=2, DP=4` (8 GPUs total):
|
||||
The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
|
||||
|
||||
```bash
|
||||
# Single node EP deployment with pplx backend
|
||||
# Single node EP deployment
|
||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tensor-parallel-size 1 \ # Tensor parallelism across 1 GPU
|
||||
--data-parallel-size 8 \ # Data parallelism across 8 processes
|
||||
--enable-expert-parallel \ # Enable expert parallelism
|
||||
--all2all-backend pplx # Use pplx communication backend
|
||||
--enable-expert-parallel # Enable expert parallelism
|
||||
```
|
||||
|
||||
## Multi-Node Deployment
|
||||
@@ -197,7 +195,6 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tensor-parallel-size 1 \ # Tensor parallelism
|
||||
--data-parallel-size 8 \ # Data parallelism
|
||||
--enable-expert-parallel \ # Enable EP
|
||||
--all2all-backend pplx \ # Use pplx communication backend
|
||||
--enable-eplb \ # Enable load balancer
|
||||
--eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
|
||||
```
|
||||
|
||||
@@ -64,7 +64,7 @@ vllm serve "$MODEL_NAME" \
|
||||
--enforce-eager \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--all2all-backend pplx \
|
||||
--all2all-backend allgather_reducescatter \
|
||||
--num-redundant-experts "$REDUNDANT_EXPERTS" \
|
||||
--trust-remote-code \
|
||||
--host "$HOST" \
|
||||
|
||||
@@ -37,7 +37,6 @@ from vllm.utils.import_utils import (
|
||||
has_deep_ep,
|
||||
has_deep_gemm,
|
||||
has_mori,
|
||||
has_pplx,
|
||||
)
|
||||
|
||||
from .mk_objects import (
|
||||
@@ -206,10 +205,6 @@ class Config:
|
||||
info = expert_info(self.fused_experts_type)
|
||||
return info.needs_deep_gemm
|
||||
|
||||
def needs_pplx(self):
|
||||
info = prepare_finalize_info(self.prepare_finalize_type)
|
||||
return info.backend == "pplx"
|
||||
|
||||
def needs_deep_ep(self):
|
||||
info = prepare_finalize_info(self.prepare_finalize_type)
|
||||
return (
|
||||
@@ -290,8 +285,6 @@ class Config:
|
||||
return False, "Needs DeepEP, but DeepEP not available."
|
||||
if self.needs_deep_gemm() and not has_deep_gemm():
|
||||
return False, "Needs DeepGEMM, but DeepGEMM not available."
|
||||
if self.needs_pplx() and not has_pplx(): # noqa: SIM103
|
||||
return False, "Needs PPLX, but PPLX not available."
|
||||
if self.needs_aiter() and not has_aiter(): # noqa: SIM103
|
||||
return False, "Needs Aiter, but Aiter not available."
|
||||
if self.needs_mori() and not has_mori(): # noqa: SIM103
|
||||
|
||||
@@ -39,7 +39,6 @@ from vllm.utils.import_utils import (
|
||||
has_deep_ep,
|
||||
has_deep_gemm,
|
||||
has_mori,
|
||||
has_pplx,
|
||||
)
|
||||
|
||||
|
||||
@@ -238,19 +237,6 @@ if has_mori():
|
||||
supports_apply_weight_on_input=False,
|
||||
)
|
||||
|
||||
if has_pplx():
|
||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize,
|
||||
)
|
||||
|
||||
register_prepare_and_finalize(
|
||||
PplxPrepareAndFinalize,
|
||||
batched_format,
|
||||
common_float_and_int_types,
|
||||
blocked_quantization_support=True,
|
||||
backend="pplx",
|
||||
)
|
||||
|
||||
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
|
||||
FlashInferCutlassMoEPrepareAndFinalize,
|
||||
|
||||
@@ -125,7 +125,7 @@ if __name__ == "__main__":
|
||||
description=(
|
||||
"Run single prepare-finalize & fused-experts combination test"
|
||||
"Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel " # noqa: E501
|
||||
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
|
||||
"--pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts"
|
||||
)
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -14,7 +14,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
||||
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
|
||||
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
|
||||
from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed
|
||||
from vllm.v1.worker.workspace import init_workspace_manager
|
||||
|
||||
@@ -39,12 +39,12 @@ from .modular_kernel_tools.parallel_utils import (
|
||||
)
|
||||
|
||||
has_any_multi_gpu_package = (
|
||||
has_deep_ep() or has_deep_gemm() or has_pplx() or has_flashinfer_cutlass_fused_moe()
|
||||
has_deep_ep() or has_deep_gemm() or has_flashinfer_cutlass_fused_moe()
|
||||
)
|
||||
|
||||
meets_multi_gpu_requirements = pytest.mark.skipif(
|
||||
not has_any_multi_gpu_package,
|
||||
reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages",
|
||||
reason="Requires deep_ep or deep_gemm or flashinfer packages",
|
||||
)
|
||||
|
||||
if current_platform.is_fp8_fnuz():
|
||||
@@ -341,7 +341,7 @@ if __name__ == "__main__":
|
||||
description=(
|
||||
"Run single prepare-finalize & fused-experts combination test"
|
||||
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
|
||||
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
|
||||
"--pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts"
|
||||
)
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -1,365 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import torch_experts
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEParallelConfig,
|
||||
RoutingMethodType,
|
||||
fp8_w8a8_moe_quant_config,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassBatchedExpertsFp8
|
||||
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.math_utils import cdiv
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
from vllm.v1.worker.workspace import init_workspace_manager
|
||||
|
||||
from ...utils import multi_gpu_test
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
|
||||
try:
|
||||
from pplx_kernels import AllToAll
|
||||
from pplx_kernels.nvshmem import (
|
||||
nvshmem_alloc_empty_unique_id,
|
||||
nvshmem_finalize,
|
||||
nvshmem_get_unique_id,
|
||||
nvshmem_init,
|
||||
)
|
||||
|
||||
has_pplx = True
|
||||
except ImportError:
|
||||
has_pplx = False
|
||||
|
||||
requires_pplx = pytest.mark.skipif(
|
||||
not has_pplx,
|
||||
reason="Requires PPLX kernels",
|
||||
)
|
||||
|
||||
NUM_EXPERTS = [40, 64]
|
||||
TOP_KS = [6, 8]
|
||||
|
||||
|
||||
def rank_chunk(num, r, w):
|
||||
rem = num % w
|
||||
return (num // w) + (1 if r < rem else 0)
|
||||
|
||||
|
||||
def chunk_by_rank(t, r, w):
|
||||
num = t.shape[0]
|
||||
chunk = rank_chunk(num, r, w)
|
||||
rem = num % w
|
||||
if rem == 0 or r < rem:
|
||||
return t[(r * chunk) : (r + 1) * chunk].contiguous()
|
||||
else:
|
||||
long_chunks = (num // w + 1) * rem
|
||||
short_chunks = (r - rem) * chunk
|
||||
start = long_chunks + short_chunks
|
||||
return t[start : start + chunk].contiguous()
|
||||
|
||||
|
||||
def pplx_cutlass_moe(
|
||||
pgi: ProcessGroupInfo,
|
||||
dp_size: int,
|
||||
a: torch.Tensor,
|
||||
w1: torch.Tensor,
|
||||
w2: torch.Tensor,
|
||||
w1_scale: torch.Tensor,
|
||||
w2_scale: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
a1_scale: torch.Tensor,
|
||||
out_dtype,
|
||||
per_act_token: bool,
|
||||
per_out_ch: bool,
|
||||
group_name: str | None,
|
||||
):
|
||||
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize,
|
||||
)
|
||||
|
||||
init_workspace_manager(torch.cuda.current_device())
|
||||
|
||||
assert torch.cuda.current_device() == pgi.local_rank
|
||||
|
||||
num_tokens, hidden_dim = a.shape
|
||||
intermediate_dim = w2.shape[2]
|
||||
num_experts = w1.shape[0]
|
||||
block_size = hidden_dim # TODO support more cases
|
||||
device = pgi.device
|
||||
rank = pgi.rank
|
||||
world_size = pgi.world_size
|
||||
rank_num_tokens = rank_chunk(num_tokens, rank, world_size)
|
||||
max_num_tokens = rank_chunk(num_tokens, 0, world_size)
|
||||
topk = topk_ids.shape[1]
|
||||
|
||||
if block_size == hidden_dim:
|
||||
scale_elems = 4 # hack to circumvent pplx data format requirements
|
||||
else:
|
||||
scale_elems = (hidden_dim + block_size - 1) // block_size
|
||||
|
||||
args = dict(
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
dp_size=dp_size,
|
||||
hidden_dim=hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim, # because a.dtype.itemsize == 1
|
||||
hidden_dim_scale_bytes=scale_elems * torch.float32.itemsize,
|
||||
)
|
||||
|
||||
if group_name is None:
|
||||
ata = AllToAll.internode(**args)
|
||||
else:
|
||||
args["group_name"] = group_name
|
||||
ata = AllToAll.intranode(**args)
|
||||
|
||||
w1 = w1.to(device)
|
||||
w2 = w2.to(device)
|
||||
w1_scale = w1_scale.to(device)
|
||||
w2_scale = w2_scale.to(device)
|
||||
a1_scale = a1_scale.to(device)
|
||||
|
||||
assert num_experts % world_size == 0
|
||||
num_local_experts = cdiv(num_experts, world_size)
|
||||
num_dispatchers = pgi.world_size // dp_size
|
||||
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
ata,
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_local_experts=num_local_experts,
|
||||
num_dispatchers=num_dispatchers,
|
||||
)
|
||||
|
||||
def make_moe_config() -> FusedMoEConfig:
|
||||
return FusedMoEConfig(
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
hidden_dim=hidden_dim,
|
||||
intermediate_size_per_partition=intermediate_dim,
|
||||
num_local_experts=num_local_experts,
|
||||
num_logical_experts=num_experts,
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
activation=MoEActivation.SILU,
|
||||
in_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
routing_method=RoutingMethodType.Llama4,
|
||||
)
|
||||
|
||||
experts = CutlassBatchedExpertsFp8(
|
||||
moe_config=make_moe_config(),
|
||||
quant_config=fp8_w8a8_moe_quant_config(
|
||||
per_act_token_quant=per_act_token,
|
||||
per_out_ch_quant=per_out_ch,
|
||||
w1_scale=chunk_by_rank(w1_scale, rank, world_size),
|
||||
w2_scale=chunk_by_rank(w2_scale, rank, world_size),
|
||||
a1_scale=chunk_by_rank(a1_scale, rank, world_size)
|
||||
if per_act_token
|
||||
else a1_scale[rank],
|
||||
),
|
||||
max_num_tokens=max_num_tokens,
|
||||
num_dispatchers=num_dispatchers,
|
||||
)
|
||||
|
||||
fused_cutlass_experts = FusedMoEModularKernel(
|
||||
prepare_finalize,
|
||||
experts,
|
||||
inplace=False,
|
||||
)
|
||||
|
||||
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
|
||||
chunk_topk_weight = chunk_by_rank(topk_weights, rank, world_size).to(device)
|
||||
chunk_topk_ids = (
|
||||
chunk_by_rank(topk_ids, rank, world_size).to(torch.uint32).to(device)
|
||||
)
|
||||
|
||||
out = fused_cutlass_experts(
|
||||
a_chunk,
|
||||
chunk_by_rank(w1, rank, world_size),
|
||||
chunk_by_rank(w2, rank, world_size),
|
||||
chunk_topk_weight,
|
||||
chunk_topk_ids,
|
||||
global_num_experts=num_experts,
|
||||
expert_map=None, # TODO
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
ata.destroy()
|
||||
|
||||
return out[:rank_num_tokens]
|
||||
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
|
||||
|
||||
def _pplx_moe(
|
||||
pgi: ProcessGroupInfo,
|
||||
dp_size: int,
|
||||
a: torch.Tensor,
|
||||
w1: torch.Tensor,
|
||||
w2: torch.Tensor,
|
||||
w1_scale: torch.Tensor,
|
||||
w2_scale: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
a1_scale: torch.Tensor,
|
||||
out_dtype,
|
||||
a_full: torch.Tensor,
|
||||
w1_full: torch.Tensor,
|
||||
w2_full: torch.Tensor,
|
||||
per_act_token: bool,
|
||||
per_out_ch: bool,
|
||||
use_internode: bool,
|
||||
):
|
||||
try:
|
||||
if use_internode:
|
||||
uid = (
|
||||
nvshmem_get_unique_id()
|
||||
if pgi.rank == 0
|
||||
else nvshmem_alloc_empty_unique_id()
|
||||
)
|
||||
torch.distributed.broadcast(uid, src=0)
|
||||
nvshmem_init(uid, pgi.rank, pgi.world_size)
|
||||
else:
|
||||
group_ranks = list(range(pgi.world_size))
|
||||
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
|
||||
group_name = cpu_group.group_name
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch_output = torch_experts(
|
||||
a_full, w1_full, w2_full, topk_weights, topk_ids
|
||||
)
|
||||
pplx_output = pplx_cutlass_moe(
|
||||
pgi,
|
||||
dp_size,
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
a1_scale,
|
||||
out_dtype,
|
||||
per_act_token,
|
||||
per_out_ch,
|
||||
group_name,
|
||||
)
|
||||
|
||||
torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(
|
||||
pplx_output.device
|
||||
)
|
||||
|
||||
# Uncomment if more debugging is needed
|
||||
# print("PPLX OUT:", pplx_output)
|
||||
# print("TORCH OUT:", torch_output)
|
||||
|
||||
torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
|
||||
finally:
|
||||
if use_internode:
|
||||
nvshmem_finalize()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [2, 224])
|
||||
@pytest.mark.parametrize("n", [3072])
|
||||
@pytest.mark.parametrize("k", [1536])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) # , [4, 2]])
|
||||
@pytest.mark.parametrize("use_internode", [False])
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.skipif(
|
||||
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
|
||||
current_platform.get_device_capability()
|
||||
),
|
||||
reason="Grouped gemm is not supported on this GPU type.",
|
||||
)
|
||||
@requires_pplx
|
||||
def test_cutlass_moe_pplx(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
per_act_token: bool,
|
||||
per_out_ch: bool,
|
||||
world_dp_size: tuple[int, int],
|
||||
use_internode: bool,
|
||||
):
|
||||
set_random_seed(7)
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
dtype = torch.half
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10.0
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10.0
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10.0
|
||||
|
||||
n_b_scales = 2 * n if per_out_ch else 1
|
||||
k_b_scales = k if per_out_ch else 1
|
||||
|
||||
w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn)
|
||||
w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
|
||||
w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32)
|
||||
w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32)
|
||||
|
||||
for expert in range(e):
|
||||
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
|
||||
w1[expert], use_per_token_if_dynamic=per_out_ch
|
||||
)
|
||||
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
|
||||
w2[expert], use_per_token_if_dynamic=per_out_ch
|
||||
)
|
||||
|
||||
w1_d = torch.empty_like(w1)
|
||||
w2_d = torch.empty_like(w2)
|
||||
for expert in range(e):
|
||||
w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half()
|
||||
w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half()
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
|
||||
|
||||
world_size, dp_size = world_dp_size
|
||||
a_scale1 = (
|
||||
torch.randn(
|
||||
(m if per_act_token else 1, 1), device="cuda", dtype=torch.float32
|
||||
)
|
||||
/ 10.0
|
||||
)
|
||||
if not per_act_token:
|
||||
a_scale1 = a_scale1.repeat(world_size, 1)
|
||||
|
||||
parallel_launch(
|
||||
world_size,
|
||||
_pplx_moe,
|
||||
dp_size,
|
||||
a,
|
||||
w1_q,
|
||||
w2_q,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
a_scale1,
|
||||
dtype,
|
||||
a,
|
||||
w1_d,
|
||||
w2_d,
|
||||
per_act_token,
|
||||
per_out_ch,
|
||||
use_internode,
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Tech
|
||||
|
||||
Here we break down the requirements in 2 steps:
|
||||
|
||||
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
|
||||
1. Build and install the Python libraries ([DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
|
||||
2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
|
||||
|
||||
Step 2 is necessary for multi-node deployment.
|
||||
|
||||
@@ -76,11 +76,4 @@ popd
|
||||
|
||||
export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH
|
||||
|
||||
# build and install pplx, require pytorch installed
|
||||
pushd "$WORKSPACE"
|
||||
git clone https://github.com/ppl-ai/pplx-kernels
|
||||
cd pplx-kernels
|
||||
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
|
||||
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
|
||||
PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . --no-deps -v
|
||||
|
||||
|
||||
@@ -4,12 +4,10 @@ set -ex
|
||||
# usage: ./install_python_libraries.sh [options]
|
||||
# --workspace <dir> workspace directory (default: ./ep_kernels_workspace)
|
||||
# --mode <mode> "install" (default) or "wheel"
|
||||
# --pplx-ref <commit> pplx-kernels commit hash
|
||||
# --deepep-ref <commit> DeepEP commit hash
|
||||
# --nvshmem-ver <ver> NVSHMEM version
|
||||
|
||||
CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
|
||||
PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"}
|
||||
DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"}
|
||||
NVSHMEM_VER=${NVSHMEM_VER:-"3.3.24"} # Default supports both CUDA 12 and 13
|
||||
WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace}
|
||||
@@ -35,14 +33,6 @@ while [[ $# -gt 0 ]]; do
|
||||
MODE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--pplx-ref)
|
||||
if [[ -z "$2" || "$2" =~ ^- ]]; then
|
||||
echo "Error: --pplx-ref requires an argument." >&2
|
||||
exit 1
|
||||
fi
|
||||
PPLX_COMMIT_HASH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--deepep-ref)
|
||||
if [[ -z "$2" || "$2" =~ ^- ]]; then
|
||||
echo "Error: --deepep-ref requires an argument." >&2
|
||||
@@ -188,14 +178,6 @@ do_build() {
|
||||
popd
|
||||
}
|
||||
|
||||
# build pplx-kernels
|
||||
do_build \
|
||||
"https://github.com/ppl-ai/pplx-kernels" \
|
||||
"pplx-kernels" \
|
||||
"setup.py" \
|
||||
"$PPLX_COMMIT_HASH" \
|
||||
""
|
||||
|
||||
# build DeepEP
|
||||
do_build \
|
||||
"https://github.com/deepseek-ai/DeepEP" \
|
||||
|
||||
@@ -988,7 +988,7 @@ def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
|
||||
return output_tensor
|
||||
|
||||
|
||||
def get_cutlass_pplx_moe_mm_data(
|
||||
def get_cutlass_batched_moe_mm_data(
|
||||
expert_offsets: torch.Tensor,
|
||||
problem_sizes1: torch.Tensor,
|
||||
problem_sizes2: torch.Tensor,
|
||||
@@ -1011,7 +1011,7 @@ def get_cutlass_pplx_moe_mm_data(
|
||||
multiplication in two grouped MMs used in
|
||||
the fused MoE operation.
|
||||
"""
|
||||
return torch.ops._C.get_cutlass_pplx_moe_mm_data(
|
||||
return torch.ops._C.get_cutlass_batched_moe_mm_data(
|
||||
expert_offsets,
|
||||
problem_sizes1,
|
||||
problem_sizes2,
|
||||
|
||||
@@ -1045,7 +1045,7 @@ class CompilationConfig:
|
||||
"are optimized for prefill and are incompatible with CUDA Graphs. "
|
||||
"In order to use CUDA Graphs for decode-optimized workloads, "
|
||||
"use --all2all-backend with another option, such as "
|
||||
"deepep_low_latency, pplx, or allgather_reducescatter."
|
||||
"deepep_low_latency or allgather_reducescatter."
|
||||
)
|
||||
self.cudagraph_mode = CUDAGraphMode.NONE
|
||||
|
||||
|
||||
@@ -152,7 +152,6 @@ class ParallelConfig:
|
||||
|
||||
- "naive": Naive all2all implementation using broadcasts\n
|
||||
- "allgather_reducescatter": All2all based on allgather and reducescatter\n
|
||||
- "pplx": Use pplx kernels\n
|
||||
- "deepep_high_throughput": Use deepep high-throughput kernels\n
|
||||
- "deepep_low_latency": Use deepep low-latency kernels\n
|
||||
- "mori": Use mori kernels\n
|
||||
@@ -310,6 +309,13 @@ class ParallelConfig:
|
||||
f"but found: {self._api_process_rank}"
|
||||
)
|
||||
|
||||
if self.all2all_backend == "pplx":
|
||||
logger.warning(
|
||||
"The 'pplx' all2all backend has been removed. "
|
||||
"Falling back to 'allgather_reducescatter'."
|
||||
)
|
||||
self.all2all_backend = "allgather_reducescatter"
|
||||
|
||||
if self.data_parallel_size_local > self.data_parallel_size:
|
||||
raise ValueError(
|
||||
f"data_parallel_size_local ({self.data_parallel_size_local}) "
|
||||
@@ -442,7 +448,6 @@ class ParallelConfig:
|
||||
# In this case, ensure the input to the experts is sequence parallel
|
||||
# to avoid the excess work.
|
||||
#
|
||||
# Not needed for pplx-kernels as it can handle duplicate input tokens.
|
||||
@property
|
||||
def use_sequence_parallel_moe(self) -> bool:
|
||||
return (
|
||||
|
||||
@@ -3,14 +3,13 @@
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed import get_dp_group, get_ep_group
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.flashinfer import has_flashinfer_all2all
|
||||
from vllm.utils.import_utils import has_deep_ep, has_mori, has_pplx
|
||||
from vllm.utils.import_utils import has_deep_ep, has_mori
|
||||
|
||||
from .base_device_communicator import All2AllManagerBase, Cache
|
||||
|
||||
@@ -235,96 +234,6 @@ class AgRsAll2AllManager(All2AllManagerBase):
|
||||
pass
|
||||
|
||||
|
||||
class PPLXAll2AllManager(All2AllManagerBase):
|
||||
"""
|
||||
All2All communication based on PPLX kernels.
|
||||
"""
|
||||
|
||||
def __init__(self, cpu_group):
|
||||
assert has_pplx(), (
|
||||
"pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
|
||||
" to install pplx_kernels."
|
||||
)
|
||||
super().__init__(cpu_group)
|
||||
|
||||
if self.internode:
|
||||
# inter-node communication needs nvshmem,
|
||||
# intra-node communication uses p2p mapping directly
|
||||
from pplx_kernels.nvshmem import ( # type: ignore[import-not-found]
|
||||
nvshmem_alloc_empty_unique_id,
|
||||
nvshmem_get_unique_id,
|
||||
nvshmem_init,
|
||||
)
|
||||
|
||||
logger.debug(
|
||||
"Initialize NVSHMEM for pplx_kernels: rank=%d, world size=%d",
|
||||
self.rank,
|
||||
self.world_size,
|
||||
)
|
||||
uid = (
|
||||
nvshmem_get_unique_id()
|
||||
if self.rank == 0
|
||||
else nvshmem_alloc_empty_unique_id()
|
||||
)
|
||||
dist.broadcast(
|
||||
uid,
|
||||
src=dist.get_process_group_ranks(self.cpu_group)[0],
|
||||
group=self.cpu_group,
|
||||
)
|
||||
logger.debug("PPLX NVSHMEM UID = %s", uid)
|
||||
nvshmem_init(uid, self.rank, self.world_size)
|
||||
|
||||
self.handle_cache = Cache()
|
||||
|
||||
def get_handle(self, kwargs):
|
||||
import pplx_kernels as pplx # type: ignore[import-not-found]
|
||||
|
||||
return self.handle_cache.get_or_create(
|
||||
kwargs,
|
||||
pplx.AllToAll.internode if self.internode else pplx.AllToAll.intranode,
|
||||
)
|
||||
|
||||
def dispatch_router_logits(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
is_sequence_parallel: bool = False,
|
||||
extra_tensors: list[torch.Tensor] | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
raise NotImplementedError
|
||||
|
||||
def dispatch(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
is_sequence_parallel: bool = False,
|
||||
extra_tensors: list[torch.Tensor] | None = None,
|
||||
) -> (
|
||||
tuple[torch.Tensor, torch.Tensor, torch.Tensor]
|
||||
| tuple[torch.Tensor, torch.Tensor, torch.Tensor, list[torch.Tensor]]
|
||||
):
|
||||
raise NotImplementedError
|
||||
|
||||
def combine(
|
||||
self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
|
||||
) -> torch.Tensor:
|
||||
raise NotImplementedError
|
||||
|
||||
def destroy(self):
|
||||
with self.handle_cache._lock:
|
||||
for _, handle in self.handle_cache._cache.items():
|
||||
handle.destroy()
|
||||
|
||||
if self.internode:
|
||||
from pplx_kernels.nvshmem import (
|
||||
nvshmem_finalize, # type: ignore[import-not-found]
|
||||
)
|
||||
|
||||
logger.debug("PPLX NVSHMEM finalize")
|
||||
nvshmem_finalize()
|
||||
|
||||
|
||||
class DeepEPAll2AllManagerBase(All2AllManagerBase):
|
||||
"""
|
||||
All2All communication based on DeepEP High-Throughput kernels.
|
||||
|
||||
@@ -112,10 +112,6 @@ class CudaCommunicator(DeviceCommunicatorBase):
|
||||
from .all2all import AgRsAll2AllManager
|
||||
|
||||
self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
|
||||
elif self.all2all_backend == "pplx":
|
||||
from .all2all import PPLXAll2AllManager
|
||||
|
||||
self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
|
||||
elif self.all2all_backend == "deepep_high_throughput":
|
||||
from .all2all import DeepEPHTAll2AllManager
|
||||
|
||||
@@ -298,7 +294,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
|
||||
self.fi_ar_comm = None
|
||||
if self.all2all_manager is not None:
|
||||
self.all2all_manager.destroy()
|
||||
self.all2all_manager = None
|
||||
self.all2all_manager = None # type: ignore[assignment]
|
||||
|
||||
def all_gatherv(
|
||||
self,
|
||||
|
||||
@@ -159,7 +159,7 @@ class EplbModelState:
|
||||
|
||||
NOTE: The expert_load_view now records load for all physical experts
|
||||
rather than just local experts. This ensures consistent load statistics
|
||||
across different dispatch methods (naive all-to-all, DeepEP, pplx-kernels).
|
||||
across different dispatch methods (naive all-to-all, DeepEP).
|
||||
The recorded load will be multiplied by dp_size when using naive all-to-all
|
||||
due to each DP rank contributing the same token set to the calculation.
|
||||
See:
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
@@ -24,16 +25,11 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
|
||||
MoEPrepareAndFinalizeNoEP,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.import_utils import has_deep_ep, has_mori, has_pplx
|
||||
from vllm.utils.import_utils import has_deep_ep, has_mori
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
if current_platform.is_cuda_alike():
|
||||
if has_pplx():
|
||||
from .pplx_prepare_finalize import (
|
||||
PplxPrepareAndFinalize,
|
||||
pplx_hidden_dim_scale_bytes,
|
||||
)
|
||||
if has_deep_ep():
|
||||
from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
|
||||
from .deepep_ll_prepare_finalize import (
|
||||
@@ -120,51 +116,10 @@ def maybe_make_prepare_finalize(
|
||||
|
||||
prepare_finalize: FusedMoEPrepareAndFinalize | None = None
|
||||
|
||||
if moe.use_pplx_kernels:
|
||||
assert quant_config is not None
|
||||
|
||||
hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes(
|
||||
moe.max_num_tokens,
|
||||
moe.hidden_dim,
|
||||
moe.in_dtype,
|
||||
quant_config.quant_dtype,
|
||||
per_act_token_quant=quant_config.per_act_token_quant,
|
||||
block_shape=quant_config.block_shape,
|
||||
)
|
||||
|
||||
all_to_all_args = dict(
|
||||
max_num_tokens=moe.max_num_tokens,
|
||||
num_experts=moe.num_experts,
|
||||
experts_per_token=moe.experts_per_token, # topk
|
||||
rank=all2all_manager.rank,
|
||||
world_size=all2all_manager.world_size,
|
||||
# dp_size actually means tp_size, bug in pplx kernels
|
||||
dp_size=all2all_manager.tp_group.world_size,
|
||||
hidden_dim=moe.hidden_dim,
|
||||
hidden_dim_bytes=hidden_dim_bytes,
|
||||
hidden_dim_scale_bytes=hidden_scale_bytes,
|
||||
)
|
||||
|
||||
num_dispatchers = (
|
||||
all2all_manager.world_size // all2all_manager.tp_group.world_size
|
||||
)
|
||||
|
||||
# Intranode pplx a2a takes a group name while internode does not.
|
||||
if not all2all_manager.internode:
|
||||
all_to_all_args["group_name"] = all2all_manager.cpu_group.group_name
|
||||
|
||||
handle = all2all_manager.get_handle(all_to_all_args)
|
||||
|
||||
prepare_finalize = PplxPrepareAndFinalize(
|
||||
handle,
|
||||
max_num_tokens=moe.max_num_tokens,
|
||||
num_local_experts=moe.num_local_experts,
|
||||
num_dispatchers=num_dispatchers,
|
||||
)
|
||||
elif moe.use_deepep_ht_kernels:
|
||||
if moe.use_deepep_ht_kernels:
|
||||
assert moe.dp_size == all2all_manager.dp_world_size
|
||||
|
||||
all_to_all_args = dict()
|
||||
all_to_all_args: dict[str, Any] = dict()
|
||||
handle = all2all_manager.get_handle(all_to_all_args)
|
||||
prepare_finalize = DeepEPHTPrepareAndFinalize(
|
||||
handle,
|
||||
|
||||
@@ -939,10 +939,6 @@ class FusedMoEParallelConfig:
|
||||
def use_all2all_kernels(self):
|
||||
return self.dp_size > 1 and self.use_ep
|
||||
|
||||
@property
|
||||
def use_pplx_kernels(self):
|
||||
return self.use_all2all_kernels and self.all2all_backend == "pplx"
|
||||
|
||||
@property
|
||||
def use_deepep_ht_kernels(self):
|
||||
return (
|
||||
@@ -962,7 +958,7 @@ class FusedMoEParallelConfig:
|
||||
|
||||
@property
|
||||
def use_batched_activation_format(self):
|
||||
return self.use_deepep_ll_kernels or self.use_pplx_kernels
|
||||
return self.use_deepep_ll_kernels
|
||||
|
||||
@property
|
||||
def use_naive_all2all_kernels(self):
|
||||
@@ -1221,10 +1217,6 @@ class FusedMoEConfig:
|
||||
def use_ep(self):
|
||||
return self.moe_parallel_config.use_ep
|
||||
|
||||
@property
|
||||
def use_pplx_kernels(self):
|
||||
return self.moe_parallel_config.use_pplx_kernels
|
||||
|
||||
@property
|
||||
def use_deepep_ht_kernels(self):
|
||||
return self.moe_parallel_config.use_deepep_ht_kernels
|
||||
|
||||
@@ -166,7 +166,7 @@ def run_cutlass_moe_fp8(
|
||||
problem_sizes1 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
|
||||
problem_sizes2 = torch.empty((local_E, 3), dtype=torch.int32, device=device)
|
||||
|
||||
ops.get_cutlass_pplx_moe_mm_data(
|
||||
ops.get_cutlass_batched_moe_mm_data(
|
||||
expert_offsets,
|
||||
problem_sizes1,
|
||||
problem_sizes2,
|
||||
|
||||
@@ -493,7 +493,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
"""
|
||||
A reference prepare/finalize class that reorganizes the tokens into
|
||||
expert batched format, i.e. E x max_num_tokens x K. This is the format
|
||||
that the PPLX dispatch/combine kernels use.
|
||||
that the batched dispatch/combine kernels use.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -648,7 +648,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
"""
|
||||
A reference MoE expert class that operates on expert batched format,
|
||||
i.e. E x max_num_tokens x K. This is the format that the pplx
|
||||
i.e. E x max_num_tokens x K. This is the format that the batched
|
||||
dispatch/combine kernels use.
|
||||
"""
|
||||
|
||||
@@ -880,7 +880,7 @@ def batched_moe_kernel_quantize_input(
|
||||
class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
"""
|
||||
A Triton based MoE expert class that operates on expert batched format,
|
||||
i.e. E x max_num_tokens x K. This is the format that the pplx
|
||||
i.e. E x max_num_tokens x K. This is the format that the batched
|
||||
dispatch/combine kernels use.
|
||||
"""
|
||||
|
||||
|
||||
@@ -1172,9 +1172,9 @@ class FusedMoEModularKernel(torch.nn.Module):
|
||||
# This happens when none of the tokens from the all2all reach this
|
||||
# EP rank. Also, note that this is only relevant for CUDAGraph
|
||||
# incompatible all2all kernels like the DeepEP high-throughput
|
||||
# kernels. CUDAGraph compatible all2all kernels like the pplx
|
||||
# kernels and the DeepEP low-latency kernels are always batched
|
||||
# and can never run into the tensor.numel() == 0 case.
|
||||
# kernels. CUDAGraph compatible all2all kernels like the DeepEP
|
||||
# low-latency kernels are always batched and can never run into
|
||||
# the tensor.numel() == 0 case.
|
||||
if M_full == 0:
|
||||
assert num_chunks == 0
|
||||
workspace13 = None
|
||||
|
||||
@@ -143,10 +143,7 @@ def select_nvfp4_moe_backend(
|
||||
# NOTE(rob): this is kind of a hack. We need to peak into
|
||||
# the prepare-finalize selection to determine if we are using
|
||||
# the batched or standard expert format.
|
||||
use_batched = (
|
||||
config.moe_parallel_config.use_deepep_ll_kernels
|
||||
or config.moe_parallel_config.use_pplx_kernels
|
||||
)
|
||||
use_batched = config.moe_parallel_config.use_deepep_ll_kernels
|
||||
activation_format = (
|
||||
mk.FusedMoEActivationFormat.BatchedExperts
|
||||
if use_batched
|
||||
|
||||
@@ -1,373 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Callable
|
||||
|
||||
import pplx_kernels as pplx
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
|
||||
TopKWeightAndReduceDelegate,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.utils import (
|
||||
_validate_scale_shape,
|
||||
moe_kernel_quantize_input,
|
||||
)
|
||||
from vllm.utils.math_utils import cdiv, round_up
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def pplx_hidden_dim_scale_bytes(
|
||||
max_num_tokens: int,
|
||||
hidden_dim: int,
|
||||
in_dtype: torch.dtype,
|
||||
quant_dtype: torch.dtype | str | None,
|
||||
per_act_token_quant: bool,
|
||||
block_shape: list[int] | None,
|
||||
):
|
||||
# All pplx byte sizes must be 16-byte aligned.
|
||||
align = 16
|
||||
|
||||
# For blocked per token: set to
|
||||
# cdiv(hidden_dim, block_size) * sizeof(float32)
|
||||
# For per-token: set to 4 * sizeof(float32) (x4 for alignment)
|
||||
if quant_dtype is not None:
|
||||
assert isinstance(quant_dtype, torch.dtype)
|
||||
assert quant_dtype.itemsize == 1
|
||||
hidden_dim_bytes = hidden_dim * quant_dtype.itemsize
|
||||
elem_size = torch.float32.itemsize
|
||||
|
||||
if per_act_token_quant:
|
||||
# per-token (M x 1)
|
||||
assert block_shape is None
|
||||
hidden_scale_bytes = elem_size
|
||||
elif block_shape is not None:
|
||||
# per-group (M x K_tiles)
|
||||
block_size = block_shape[1]
|
||||
num_blocks = cdiv(hidden_dim, block_size)
|
||||
hidden_scale_bytes = num_blocks * elem_size
|
||||
else:
|
||||
# per-tensor (1 x 1)
|
||||
hidden_scale_bytes = elem_size
|
||||
else:
|
||||
hidden_dim_bytes = hidden_dim * in_dtype.itemsize
|
||||
hidden_scale_bytes = 0
|
||||
|
||||
return (
|
||||
round_up(hidden_dim_bytes, align),
|
||||
round_up(hidden_scale_bytes, align),
|
||||
)
|
||||
|
||||
|
||||
class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
"""PPLX-based prepare and finalize for expert parallelism."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
a2a: pplx.AllToAll,
|
||||
max_num_tokens: int,
|
||||
num_local_experts: int,
|
||||
num_dispatchers: int,
|
||||
):
|
||||
super().__init__()
|
||||
assert max_num_tokens > 0
|
||||
assert num_local_experts > 0
|
||||
self.a2a = a2a
|
||||
self.max_num_tokens = max_num_tokens
|
||||
self.num_local_experts = num_local_experts
|
||||
self.num_dispatchers_ = num_dispatchers
|
||||
|
||||
@property
|
||||
def activation_format(self) -> mk.FusedMoEActivationFormat:
|
||||
return mk.FusedMoEActivationFormat.BatchedExperts
|
||||
|
||||
def max_num_tokens_per_rank(self) -> int | None:
|
||||
return self.max_num_tokens
|
||||
|
||||
def topk_indices_dtype(self) -> torch.dtype | None:
|
||||
return torch.uint32
|
||||
|
||||
def num_dispatchers(self) -> int:
|
||||
return self.num_dispatchers_
|
||||
|
||||
def output_is_reduced(self) -> bool:
|
||||
return True
|
||||
|
||||
def supports_async(self) -> bool:
|
||||
return True
|
||||
|
||||
def prepare_async(
|
||||
self,
|
||||
a1: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
num_experts: int,
|
||||
expert_map: torch.Tensor | None,
|
||||
apply_router_weight_on_input: bool,
|
||||
quant_config: FusedMoEQuantConfig,
|
||||
defer_input_quant: bool = False,
|
||||
) -> tuple[Callable, mk.ReceiverType]:
|
||||
if defer_input_quant:
|
||||
raise NotImplementedError(
|
||||
f"{self.__class__.__name__} does not support defer_input_quant=True. "
|
||||
"Please select an MoE kernel that accepts quantized inputs."
|
||||
)
|
||||
|
||||
num_tokens = a1.size(0) # M
|
||||
hidden_dim = a1.size(-1) # K
|
||||
|
||||
assert topk_ids.size(0) == num_tokens
|
||||
# expert_map should be None because with expert map, -1 id is used for
|
||||
# non-local token; this causes error when casting ids to the
|
||||
# topk_indices_dtype() int32
|
||||
#
|
||||
if expert_map is not None:
|
||||
logger.warning_once(
|
||||
"The PPLX backend does not support expert mapping. "
|
||||
"The provided `expert_map` will be ignored."
|
||||
)
|
||||
expert_map = None # noqa: F841
|
||||
|
||||
# Is this always going to be a1.device?
|
||||
device = a1.device
|
||||
|
||||
if apply_router_weight_on_input:
|
||||
topk = topk_ids.size(1)
|
||||
# TODO: this only works for topK=1, will need to update for topK>1
|
||||
assert topk == 1, (
|
||||
"apply_router_weight_on_input is only implemented for topk=1"
|
||||
)
|
||||
a1 = a1 * topk_weights.to(a1.dtype)
|
||||
|
||||
repeat_cols = 4
|
||||
repeat_rows = 1 if quant_config.per_act_token_quant else a1.size(0)
|
||||
# TODO(bnell): always pass quant_config.a1_scale?
|
||||
a1q, a1q_scale = moe_kernel_quantize_input(
|
||||
a1,
|
||||
(None if quant_config.per_act_token_quant else quant_config.a1_scale),
|
||||
quant_dtype=quant_config.quant_dtype,
|
||||
per_act_token_quant=quant_config.per_act_token_quant,
|
||||
block_shape=quant_config.block_shape,
|
||||
)
|
||||
|
||||
_validate_scale_shape(
|
||||
a1q, a1q_scale, quant_config.per_act_token_quant, quant_config.block_shape
|
||||
)
|
||||
|
||||
orig_a_scale_block_shape: int | None = None
|
||||
|
||||
if a1q_scale is not None:
|
||||
scalar_scales = a1q_scale.numel() == 1
|
||||
|
||||
# pplx requires 2-d scales even for scalar scales
|
||||
if a1q_scale.dim() <= 1:
|
||||
assert scalar_scales
|
||||
a1q_scale = a1q_scale.view(1, 1)
|
||||
|
||||
orig_a_scale_block_shape = a1q_scale.shape[-1]
|
||||
|
||||
if not quant_config.is_block_quantized:
|
||||
# TODO (bnell): use group_broadcast instead?
|
||||
a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
|
||||
|
||||
assert a1q_scale is None or a1q_scale.ndim == 2, (
|
||||
f"{0 if a1q_scale is None else (a1q_scale.ndim, a1q_scale.shape)}"
|
||||
)
|
||||
|
||||
expert_num_tokens = torch.empty(
|
||||
self.num_local_experts,
|
||||
dtype=torch.int32,
|
||||
device=device,
|
||||
)
|
||||
|
||||
expert_x = torch.empty(
|
||||
(
|
||||
self.num_local_experts,
|
||||
self.max_num_tokens * self.num_dispatchers(),
|
||||
hidden_dim,
|
||||
),
|
||||
dtype=a1q.dtype,
|
||||
device=device,
|
||||
)
|
||||
|
||||
expert_x_scale: torch.Tensor | None = None
|
||||
if a1q.dtype.itemsize == 1:
|
||||
if quant_config.is_per_act_token:
|
||||
# (M x 1) -> (E x M x K)
|
||||
final_dim = expert_x.size(2)
|
||||
elif quant_config.is_per_tensor:
|
||||
# (1 x 1) -> (E x 1 x 1)
|
||||
final_dim = 1
|
||||
else:
|
||||
# (M x K_tiles) -> (E x M x K_tiles)
|
||||
assert quant_config.block_shape is not None
|
||||
num_blocks = cdiv(expert_x.size(2), quant_config.block_shape[1])
|
||||
final_dim = num_blocks
|
||||
|
||||
expert_x_scale_shape = (
|
||||
self.num_local_experts,
|
||||
expert_x.size(1),
|
||||
round_up(final_dim, 4), # round up for alignment
|
||||
)
|
||||
|
||||
expert_x_scale = torch.empty(
|
||||
expert_x_scale_shape,
|
||||
dtype=torch.float32,
|
||||
device=expert_x.device,
|
||||
)
|
||||
|
||||
# This argument is optional, defaults to indices.size(0)
|
||||
# There's not much point setting this unless it is != indices.size(0)
|
||||
bound_m: torch.Tensor | None = None
|
||||
|
||||
self.a2a.dispatch(
|
||||
out_expert_num_tokens=expert_num_tokens,
|
||||
out_expert_x=expert_x,
|
||||
out_expert_x_scale=expert_x_scale,
|
||||
dp_x=a1q,
|
||||
dp_x_scale=a1q_scale,
|
||||
indices=topk_ids,
|
||||
bound_m=bound_m,
|
||||
do_send=True,
|
||||
do_recv=False,
|
||||
)
|
||||
|
||||
hook = lambda: self.a2a.dispatch(
|
||||
out_expert_num_tokens=expert_num_tokens,
|
||||
out_expert_x=expert_x,
|
||||
out_expert_x_scale=expert_x_scale,
|
||||
dp_x=a1q,
|
||||
dp_x_scale=a1q_scale,
|
||||
indices=topk_ids,
|
||||
bound_m=bound_m,
|
||||
do_send=False,
|
||||
do_recv=True,
|
||||
)
|
||||
|
||||
return (
|
||||
hook,
|
||||
lambda: self._receiver(
|
||||
expert_num_tokens,
|
||||
expert_x,
|
||||
expert_x_scale,
|
||||
orig_a_scale_block_shape,
|
||||
),
|
||||
)
|
||||
|
||||
def _receiver(
|
||||
self,
|
||||
expert_num_tokens: torch.Tensor,
|
||||
expert_x: torch.Tensor,
|
||||
expert_x_scale: torch.Tensor | None,
|
||||
orig_a_scale_block_shape: int | None,
|
||||
) -> mk.PrepareResultType:
|
||||
if expert_x_scale is not None:
|
||||
expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
|
||||
assert expert_x_scale.ndim == 3
|
||||
|
||||
expert_tokens_meta = mk.ExpertTokensMetadata(
|
||||
expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None
|
||||
)
|
||||
|
||||
return expert_x, expert_x_scale, expert_tokens_meta, None, None
|
||||
|
||||
def prepare(
|
||||
self,
|
||||
a1: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
num_experts: int,
|
||||
expert_map: torch.Tensor | None,
|
||||
apply_router_weight_on_input: bool,
|
||||
quant_config: FusedMoEQuantConfig,
|
||||
defer_input_quant: bool = False,
|
||||
) -> mk.PrepareResultType:
|
||||
hook, receiver = self.prepare_async(
|
||||
a1,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
num_experts,
|
||||
expert_map,
|
||||
apply_router_weight_on_input,
|
||||
quant_config,
|
||||
defer_input_quant=defer_input_quant,
|
||||
)
|
||||
hook()
|
||||
return receiver()
|
||||
|
||||
def finalize_async(
|
||||
self,
|
||||
output: torch.Tensor,
|
||||
fused_expert_output: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
apply_router_weight_on_input: bool,
|
||||
weight_and_reduce_impl: mk.TopKWeightAndReduce,
|
||||
) -> Callable:
|
||||
assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate), (
|
||||
"Weight application and reduction happens in the combine kernel."
|
||||
)
|
||||
|
||||
# This argument is optional
|
||||
# There's not much point setting this unless it is != topk_ids.size(0)
|
||||
bound_m: torch.Tensor | None = None
|
||||
|
||||
# TODO (bnell): fails in test_pplx_moe.py, figure out what's going on
|
||||
# num_tokens = output.size(0) # M
|
||||
# assert topk_ids.size(0) == num_tokens, (
|
||||
# f"{topk_ids.size(0)} == {num_tokens}")
|
||||
assert topk_ids.size() == topk_weights.size(), (
|
||||
f"{topk_ids.size()} == {topk_weights.size()}"
|
||||
)
|
||||
assert output.size(0) <= self.max_num_tokens, (
|
||||
f"{output.size(0)} <= {self.max_num_tokens}"
|
||||
)
|
||||
assert output.size(1) == fused_expert_output.size(-1)
|
||||
|
||||
# Set weights to 1 if we did them in dispatch. This is hacky.
|
||||
if apply_router_weight_on_input:
|
||||
topk_weights = torch.ones_like(topk_weights)
|
||||
|
||||
topk_ids_u32 = topk_ids.view(dtype=torch.uint32)
|
||||
|
||||
self.a2a.combine(
|
||||
out_tokens=output,
|
||||
indices=topk_ids_u32,
|
||||
weights=topk_weights,
|
||||
expert_y=fused_expert_output,
|
||||
bound_m=bound_m,
|
||||
do_send=True,
|
||||
do_recv=False,
|
||||
)
|
||||
|
||||
return lambda: self.a2a.combine(
|
||||
out_tokens=output,
|
||||
indices=topk_ids_u32,
|
||||
weights=topk_weights,
|
||||
expert_y=fused_expert_output,
|
||||
bound_m=bound_m,
|
||||
do_send=False,
|
||||
do_recv=True,
|
||||
)
|
||||
|
||||
def finalize(
|
||||
self,
|
||||
output: torch.Tensor,
|
||||
fused_expert_output: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
apply_router_weight_on_input: bool,
|
||||
weight_and_reduce_impl: mk.TopKWeightAndReduce,
|
||||
) -> None:
|
||||
receiver = self.finalize_async(
|
||||
output,
|
||||
fused_expert_output,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
apply_router_weight_on_input,
|
||||
weight_and_reduce_impl,
|
||||
)
|
||||
receiver()
|
||||
@@ -216,8 +216,7 @@ class DefaultMoERunner(MoERunner):
|
||||
@property
|
||||
def use_dp_chunking(self) -> bool:
|
||||
return (
|
||||
self.moe_config.moe_parallel_config.use_pplx_kernels
|
||||
or self.moe_config.moe_parallel_config.use_deepep_ll_kernels
|
||||
self.moe_config.moe_parallel_config.use_deepep_ll_kernels
|
||||
or self.moe_config.moe_parallel_config.use_mori_kernels
|
||||
or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels
|
||||
) and envs.VLLM_ENABLE_MOE_DP_CHUNK
|
||||
|
||||
@@ -14,10 +14,11 @@ class TopKWeightAndReduceDelegate(mk.TopKWeightAndReduce):
|
||||
implementation does not perform weight application and reduction
|
||||
but cannot address the needs of all the compatible PrepareAndFinalize
|
||||
implementations.
|
||||
For example, BatchedTritonExperts is compatible with both
|
||||
PplxPrepareAndFinalize and BatchedPrepareAndFinalize. PplxPrepareAndFinalize
|
||||
does the weight-application + reduction as part of the pplx combine kernel.
|
||||
But the BatchedPrepareAndFinalize needs an implementation. To facilitate
|
||||
For example, BatchedTritonExperts is compatible with both batched
|
||||
PrepareAndFinalize implementations like DeepEPLLPrepareAndFinalize and
|
||||
BatchedPrepareAndFinalize. Some PrepareAndFinalize implementations do
|
||||
the weight-application + reduction as part of the combine kernel, while
|
||||
BatchedPrepareAndFinalize needs an explicit implementation. To facilitate
|
||||
this case, the BatchedTritonExperts could use TopKWeightAndReduceDelegate
|
||||
so the PrepareAndFinalize implementations could choose how to
|
||||
weight + reduce.
|
||||
|
||||
@@ -798,7 +798,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
# batched activation format. As self.fused_experts is not
|
||||
# initialized at this point, we resort to checking the MoE config
|
||||
# directly.
|
||||
is_batched_moe = self.moe.use_pplx_kernels or self.moe.use_deepep_ll_kernels
|
||||
is_batched_moe = self.moe.use_deepep_ll_kernels
|
||||
if is_batched_moe:
|
||||
num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8
|
||||
else:
|
||||
|
||||
@@ -402,11 +402,6 @@ def _has_module(module_name: str) -> bool:
|
||||
return importlib.util.find_spec(module_name) is not None
|
||||
|
||||
|
||||
def has_pplx() -> bool:
|
||||
"""Whether the optional `pplx_kernels` package is available."""
|
||||
return _has_module("pplx_kernels")
|
||||
|
||||
|
||||
def has_deep_ep() -> bool:
|
||||
"""Whether the optional `deep_ep` package is available."""
|
||||
return _has_module("deep_ep")
|
||||
|
||||
Reference in New Issue
Block a user