[BugFix] Add an env to disable moe chunking to work around compile incompatibility (#19642)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-22 15:17:49 -07:00
parent e91386cde1
commit 33d51f599e
2 changed files with 12 additions and 1 deletion
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -225,6 +225,10 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
        else:
            raise ValueError(f"Unsupported FusedMoe activation: {activation}")

+    def enable_chunking(self):  # needs env flag and supports_chunking()
+        env_allows = envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING
+        return env_allows and self.supports_chunking()
+
    @abstractmethod
    def apply(
        self,
@@ -400,7 +404,7 @@ class FusedMoEModularKernel(torch.nn.Module):
        else:
            _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids)

-            if self.fused_experts.supports_chunking():
+            if self.fused_experts.enable_chunking():
                CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
                num_chunks = cdiv(M, CHUNK_SIZE)
            else: