From 5e5baa91aa163ec3c49ed8b3a49b4dab3430a436 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Mon, 16 Jun 2025 10:58:01 -0400
Subject: [PATCH] [Kernels] Use empty for modular MoE workspaces (#19667)

Signed-off-by: Bill Nell <bnell@redhat.com>
---
 vllm/model_executor/layers/fused_moe/fused_batched_moe.py | 3 +++
 vllm/model_executor/layers/fused_moe/modular_kernel.py    | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 3bbae4e57..a12cfafd4 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -716,6 +716,9 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         intermediate_cache2 = _resize_cache(workspace2,
                                             (E, max_num_tokens, N // 2))
 
+        if self.use_fp8_w8a8:
+            intermediate_cache1.fill_(0)
+
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
                                          B=w1,
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 9409b5998..ed3b6b8a1 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -426,10 +426,10 @@ class FusedMoEModularKernel(torch.nn.Module):
 
             # We can reuse the memory between cache1 and cache3 because by the
             # time we need cache3, we're done with cache1.
-            workspace13 = torch.zeros(prod(workspace13_shape),
+            workspace13 = torch.empty(prod(workspace13_shape),
                                       device=a1.device,
                                       dtype=workspace_dtype)
-            workspace2 = torch.zeros(prod(workspace2_shape),
+            workspace2 = torch.empty(prod(workspace2_shape),
                                      device=a1.device,
                                      dtype=workspace_dtype)