From 4685a630a293cd7c928092efd0f8c2606a770877 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 19 Feb 2026 10:56:14 -0500 Subject: [PATCH] [Model Bash][DeepSeekR1] Remove Shared Expert Clone (#34344) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- .../fused_moe/runner/default_moe_runner.py | 22 +++++++++---------- vllm/model_executor/models/minicpm.py | 2 +- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index c0d23964c..e92f068f0 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -240,24 +240,22 @@ class DefaultMoERunner(MoERunner): ) ) - hidden_states_clone: torch.Tensor | None = None + shared_experts_input: torch.Tensor | None = None if use_shared_experts_stream: assert self.shared_experts_stream is not None + assert self.moe_config.disable_inplace shared_experts_input = ( shared_input if shared_input is not None else hidden_states ) - # Clone BEFORE switching streams to avoid race condition - # where routed_expert kernel may mutate hidden_states. - hidden_states_clone = shared_experts_input.clone() - - # Record that the clone will be used by shared_experts_stream - # to avoid gc issue from deallocation of hidden_states_clone - # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 + # Record that the shared_experts_input will be used in the + # shared_experts_stream to avoid gc issue from + # deallocation. For more details: + # https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 # NOTE: We don't need shared_output.record_stream(current_stream()) # because we synch the streams before using shared_output. 
- hidden_states_clone.record_stream(self.shared_experts_stream) + shared_experts_input.record_stream(self.shared_experts_stream) # Mark sync start point for the separate shared experts # stream here since we want to run in parallel with the @@ -265,7 +263,7 @@ class DefaultMoERunner(MoERunner): assert self.shared_experts_stream is not None self.shared_experts_stream.wait_stream(current_stream()) - return use_shared_experts_stream, hidden_states_clone + return use_shared_experts_stream, shared_experts_input def ensure_dp_chunking_init(self): if not self.use_dp_chunking or self.batched_hidden_states is not None: @@ -584,7 +582,7 @@ class DefaultMoERunner(MoERunner): use_chunked_impl = self.use_dp_chunking - use_shared_experts_stream, hidden_states_clone = ( + use_shared_experts_stream, shared_experts_input = ( self._maybe_setup_shared_experts_stream( hidden_states, shared_input, @@ -726,7 +724,7 @@ class DefaultMoERunner(MoERunner): with torch.cuda.stream(self.shared_experts_stream): # Note that hidden_states clone() is necessary here to avoid # conflict with the main stream - shared_output = self.shared_experts(hidden_states_clone) + shared_output = self.shared_experts(shared_experts_input) current_stream().wait_stream(self.shared_experts_stream) final_hidden_states = ( diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 4217d119a..4492b5763 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -175,7 +175,7 @@ class MiniCPMMoE(nn.Module): ) final_hidden_states = fused_experts( - hidden_states, self.ws, self.w2s, topk_weights, topk_ids, inplace=True + hidden_states, self.ws, self.w2s, topk_weights, topk_ids, inplace=False ) if self.tp_size > 1: