[Feat][Perf] Enable deepep-low-latency with round-robin expert placement. (#28449)

Signed-off-by: bruceszchen <bruceszchen@tencent.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit was authored by Chen Bruce on 2025-11-19 20:46:24 +08:00 and committed by GitHub.
parent ba558c029a
commit da2f6800e0
8 changed files with 208 additions and 37 deletions

View File

@@ -1018,7 +1018,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
del layer.w13_input_scale
del layer.w2_input_scale
def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
def maybe_make_prepare_finalize(
self,
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> mk.FusedMoEPrepareAndFinalize | None:
if (
self.rocm_aiter_moe_enabled
or self.use_marlin
@@ -1039,7 +1042,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
logger.debug_once("%s", prepare_finalize.__class__.__name__)
return prepare_finalize
else:
return super().maybe_make_prepare_finalize()
return super().maybe_make_prepare_finalize(routing_tables)
def select_gemm_impl(
self,