[Kernel] Add FlashInfer MoE A2A Kernel (#36022)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Signed-off-by: Leo Tian <lctian@nvidia.com>
Co-authored-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Stefano Castagnetta <scastagnetta@nvidia.com>
Co-authored-by: root <root@lyris0267.lyris.clusters.nvidia.com>
@@ -45,7 +45,9 @@ All2AllBackend = Literal[
     "mori",
     "nixl_ep",
     "allgather_reducescatter",
-    "flashinfer_all2allv",
+    "flashinfer_all2allv",  # temporary alias for flashinfer_nvlink_two_sided
+    "flashinfer_nvlink_two_sided",
+    "flashinfer_nvlink_one_sided",
 ]
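Per the comment in the hunk above, the old "flashinfer_all2allv" value is kept only as a temporary alias for "flashinfer_nvlink_two_sided". A minimal sketch of how such an alias could be normalized at config time; the helper name and alias table below are hypothetical illustrations, not part of this diff:

from typing import Literal

All2AllBackend = Literal[
    "mori",
    "nixl_ep",
    "allgather_reducescatter",
    "flashinfer_all2allv",  # temporary alias for flashinfer_nvlink_two_sided
    "flashinfer_nvlink_two_sided",
    "flashinfer_nvlink_one_sided",
]

# Hypothetical alias table: maps the deprecated name to its replacement.
_BACKEND_ALIASES: dict[str, str] = {
    "flashinfer_all2allv": "flashinfer_nvlink_two_sided",
}

def normalize_all2all_backend(backend: str) -> str:
    """Resolve temporary aliases to canonical backend names."""
    return _BACKEND_ALIASES.get(backend, backend)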
@@ -158,7 +160,8 @@ class ParallelConfig:
     - "deepep_low_latency": Use deepep low-latency kernels\n
     - "mori": Use mori kernels\n
     - "nixl_ep": Use nixl-ep kernels\n
-    - "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl"""
+    - "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
+    - "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""
 
     max_parallel_loading_workers: int | None = None
     """Maximum number of parallel loading workers when loading model
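A hedged usage sketch: assuming vLLM selects the MoE all2all backend through the VLLM_ALL2ALL_BACKEND environment variable (an assumption about the selection mechanism, not confirmed by this diff), the new one-sided kernel would be enabled like this:

import os

# Assumption: the all2all backend is read from this environment
# variable before the engine initializes; pick the new high-throughput
# one-sided NVLink kernel added in this commit.
os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_nvlink_one_sided"

Passing the old "flashinfer_all2allv" value should keep working for now, since the diff retains it as a temporary alias for "flashinfer_nvlink_two_sided".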