[Feat][Perf] Enable deepep-low-latency with round-robin expert placement. (#28449)
Signed-off-by: bruceszchen <bruceszchen@tencent.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -1018,7 +1018,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
del layer.w13_input_scale
|
||||
del layer.w2_input_scale
|
||||
|
||||
def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
|
||||
def maybe_make_prepare_finalize(
|
||||
self,
|
||||
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
|
||||
) -> mk.FusedMoEPrepareAndFinalize | None:
|
||||
if (
|
||||
self.rocm_aiter_moe_enabled
|
||||
or self.use_marlin
|
||||
@@ -1039,7 +1042,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
logger.debug_once("%s", prepare_finalize.__class__.__name__)
|
||||
return prepare_finalize
|
||||
else:
|
||||
return super().maybe_make_prepare_finalize()
|
||||
return super().maybe_make_prepare_finalize(routing_tables)
|
||||
|
||||
def select_gemm_impl(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user