[Kernel] Add FlashInfer MoE A2A Kernel (#36022)

Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Signed-off-by: Leo Tian <lctian@nvidia.com>
Co-authored-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Stefano Castagnetta <scastagnetta@nvidia.com>
Co-authored-by: root <root@lyris0267.lyris.clusters.nvidia.com>
This commit is contained in:
leo-cf-tian
2026-03-16 02:45:32 -04:00
committed by GitHub
parent 2390d44209
commit 2754231ba3
19 changed files with 417 additions and 43 deletions

View File

@@ -45,7 +45,9 @@ All2AllBackend = Literal[
"mori",
"nixl_ep",
"allgather_reducescatter",
"flashinfer_all2allv",
"flashinfer_all2allv", # temporary alias for flashinfer_nvlink_two_sided
"flashinfer_nvlink_two_sided",
"flashinfer_nvlink_one_sided",
]
@@ -158,7 +160,8 @@ class ParallelConfig:
- "deepep_low_latency": Use deepep low-latency kernels\n
- "mori": Use mori kernels\n
- "nixl_ep": Use nixl-ep kernels\n
- "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""
- "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl\n
- "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""
max_parallel_loading_workers: int | None = None
"""Maximum number of parallel loading workers when loading model