[Feat][Perf] Enable deepep-low-latency with round-robin expert placement. (#28449)

Signed-off-by: bruceszchen <bruceszchen@tencent.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-11-19 20:46:24 +08:00
parent ba558c029a
commit da2f6800e0
8 changed files with 208 additions and 37 deletions
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1018,7 +1018,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            del layer.w13_input_scale
            del layer.w2_input_scale

-    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
+    def maybe_make_prepare_finalize(
+        self,
+        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+    ) -> mk.FusedMoEPrepareAndFinalize | None:
        if (
            self.rocm_aiter_moe_enabled
            or self.use_marlin
@@ -1039,7 +1042,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            logger.debug_once("%s", prepare_finalize.__class__.__name__)
            return prepare_finalize
        else:
-            return super().maybe_make_prepare_finalize()
+            return super().maybe_make_prepare_finalize(routing_tables)

    def select_gemm_impl(
        self,