From 5e5baa91aa163ec3c49ed8b3a49b4dab3430a436 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Mon, 16 Jun 2025 10:58:01 -0400
Subject: [PATCH] [Kernels] Use empty for modular MoE workspaces (#19667)

Signed-off-by: Bill Nell
---
 vllm/model_executor/layers/fused_moe/fused_batched_moe.py | 3 +++
 vllm/model_executor/layers/fused_moe/modular_kernel.py    | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 3bbae4e57..a12cfafd4 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -716,6 +716,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         intermediate_cache2 = _resize_cache(workspace2,
                                             (E, max_num_tokens, N // 2))
 
+        if self.use_fp8_w8a8:
+            intermediate_cache1.fill_(0)
+
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
                                          B=w1,
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 9409b5998..ed3b6b8a1 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -426,10 +426,10 @@ class FusedMoEModularKernel(torch.nn.Module):
 
         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
-        workspace13 = torch.zeros(prod(workspace13_shape),
+        workspace13 = torch.empty(prod(workspace13_shape),
                                   device=a1.device,
                                   dtype=workspace_dtype)
-        workspace2 = torch.zeros(prod(workspace2_shape),
+        workspace2 = torch.empty(prod(workspace2_shape),
                                  device=a1.device,
                                  dtype=workspace_dtype)
 