[Kernel] Add FlashInfer MoE A2A Kernel (#36022)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Signed-off-by: Leo Tian <lctian@nvidia.com>
Co-authored-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Stefano Castagnetta <scastagnetta@nvidia.com>
Co-authored-by: root <root@lyris0267.lyris.clusters.nvidia.com>
This commit is contained in:
leo-cf-tian
2026-03-16 02:45:32 -04:00
committed by GitHub
parent 2390d44209
commit 2754231ba3
19 changed files with 417 additions and 43 deletions

View File

@@ -33,7 +33,10 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
)
from vllm.platforms import current_platform
from vllm.utils.deep_gemm import is_deep_gemm_supported
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.flashinfer import (
has_flashinfer_cutlass_fused_moe,
has_flashinfer_nvlink_one_sided,
)
from vllm.utils.import_utils import (
has_aiter,
has_deep_ep,
@@ -234,15 +237,15 @@ if has_mori():
)
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
FlashInferA2APrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_two_sided_prepare_finalize import ( # noqa: E501
FlashInferNVLinkTwoSidedPrepareAndFinalize,
)
register_prepare_and_finalize(
FlashInferA2APrepareAndFinalize,
FlashInferNVLinkTwoSidedPrepareAndFinalize,
standard_format,
nvfp4_types + fp8_types,
blocked_quantization_support=True,
@@ -263,6 +266,36 @@ else:
FlashInferCutlassMoEPrepareAndFinalize = None
FlashInferExperts = None
# Registration of the FlashInfer NVLink one-sided prepare/finalize class.
# The guard combines three checks: has_flashinfer_nvlink_one_sided(),
# has_flashinfer_cutlass_fused_moe(), and
# current_platform.has_device_capability(100).
# NOTE(review): 100 presumably denotes compute capability 10.0 -- confirm
# against the has_device_capability() contract.
if (
has_flashinfer_nvlink_one_sided()
and has_flashinfer_cutlass_fused_moe()
and current_platform.has_device_capability(100)
):
# Imported here, under the guard, rather than at the top of the module.
from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_one_sided_prepare_finalize import ( # noqa: E501
FlashInferNVLinkOneSidedPrepareAndFinalize,
)
# Registered with standard_format for nvfp4_types only, under the backend
# name "flashinfer_nvlink_one_sided"; blocked quantization and
# apply-weight-on-input are both declared unsupported.
register_prepare_and_finalize(
FlashInferNVLinkOneSidedPrepareAndFinalize,
standard_format,
nvfp4_types,
blocked_quantization_support=False,
backend="flashinfer_nvlink_one_sided",
supports_apply_weight_on_input=False,
)
# Registration of the TrtLlmNvFp4ExpertsModular experts class.
# The guard combines has_flashinfer_cutlass_fused_moe() with
# current_platform.has_device_capability(100).
# NOTE(review): 100 presumably denotes compute capability 10.0 -- confirm
# against the has_device_capability() contract.
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
# Imported here, under the guard, rather than at the top of the module.
from vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe import (
TrtLlmNvFp4ExpertsModular,
)
# Registered with standard_format for nvfp4_types only; blocked quantization
# is declared unsupported and expert maps are declared supported.
register_experts(
TrtLlmNvFp4ExpertsModular,
standard_format,
nvfp4_types,
blocked_quantization_support=False,
supports_expert_map=True,
)
if has_aiter():
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (