[GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)

Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
This commit is contained in:
Varun Sundar Rabindranath
2025-10-16 15:53:11 -04:00
committed by GitHub
parent 2ed8b6b3d0
commit fb0571b077
12 changed files with 1174 additions and 335 deletions

View File

@@ -22,6 +22,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     ocp_mx_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    BatchedMarlinExperts,
     MarlinExperts,
     fused_marlin_moe,
 )
@@ -797,9 +798,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             prepare_finalize.activation_format
             == mk.FusedMoEActivationFormat.BatchedExperts
         ):
-            raise NotImplementedError(
-                "Mxfp4 does not support batched experts format for EP"
-            )
+            if self.mxfp4_backend == Mxfp4Backend.MARLIN:
+                max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
+                assert max_num_tokens_per_rank is not None
+                assert self.moe_quant_config is not None
+                return BatchedMarlinExperts(
+                    max_num_tokens=max_num_tokens_per_rank,
+                    num_dispatchers=prepare_finalize.num_dispatchers(),
+                    quant_config=self.moe_quant_config,
+                )
+            else:
+                raise NotImplementedError(
+                    "Incompatible Mxfp4 backend for EP batched experts format"
+                )
         else:
             assert self.moe_quant_config is not None
             if (