[GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
This commit is contained in:
committed by
GitHub
parent
2ed8b6b3d0
commit
fb0571b077
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     ocp_mx_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    BatchedMarlinExperts,
     MarlinExperts,
     fused_marlin_moe,
 )
@@ -797,9 +798,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             prepare_finalize.activation_format
             == mk.FusedMoEActivationFormat.BatchedExperts
         ):
-            raise NotImplementedError(
-                "Mxfp4 does not support batched experts format for EP"
-            )
+            if self.mxfp4_backend == Mxfp4Backend.MARLIN:
+                max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
+                assert max_num_tokens_per_rank is not None
+                assert self.moe_quant_config is not None
+                return BatchedMarlinExperts(
+                    max_num_tokens=max_num_tokens_per_rank,
+                    num_dispatchers=prepare_finalize.num_dispatchers(),
+                    quant_config=self.moe_quant_config,
+                )
+            else:
+                raise NotImplementedError(
+                    "Incompatible Mxfp4 backend for EP batched experts format"
+                )
         else:
             assert self.moe_quant_config is not None
             if (
Reference in New Issue
Block a user