[GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)

Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
This commit is contained in:
Varun Sundar Rabindranath
2025-10-16 15:53:11 -04:00
committed by GitHub
parent 2ed8b6b3d0
commit fb0571b077
12 changed files with 1174 additions and 335 deletions

View File

@@ -22,6 +22,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     ocp_mx_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    BatchedMarlinExperts,
     MarlinExperts,
     fused_marlin_moe,
 )
@@ -797,9 +798,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             prepare_finalize.activation_format
             == mk.FusedMoEActivationFormat.BatchedExperts
         ):
-            raise NotImplementedError(
-                "Mxfp4 does not support batched experts format for EP"
-            )
+            if self.mxfp4_backend == Mxfp4Backend.MARLIN:
+                max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
+                assert max_num_tokens_per_rank is not None
+                assert self.moe_quant_config is not None
+                return BatchedMarlinExperts(
+                    max_num_tokens=max_num_tokens_per_rank,
+                    num_dispatchers=prepare_finalize.num_dispatchers(),
+                    quant_config=self.moe_quant_config,
+                )
+            else:
+                raise NotImplementedError(
+                    "Incompatible Mxfp4 backend for EP batched experts format"
+                )
         else:
             assert self.moe_quant_config is not None
             if (