From 4685a630a293cd7c928092efd0f8c2606a770877 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 19 Feb 2026 10:56:14 -0500 Subject: [PATCH] [Model Bash][DeepSeekR1] Remove Shared Expert Clone (#34344) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- .../fused_moe/runner/default_moe_runner.py | 22 +++++++++---------- vllm/model_executor/models/minicpm.py | 2 +- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index c0d23964c..e92f068f0 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -240,24 +240,22 @@ class DefaultMoERunner(MoERunner): ) ) - hidden_states_clone: torch.Tensor | None = None + shared_experts_input: torch.Tensor | None = None if use_shared_experts_stream: assert self.shared_experts_stream is not None + assert self.moe_config.disable_inplace shared_experts_input = ( shared_input if shared_input is not None else hidden_states ) - # Clone BEFORE switching streams to avoid race condition - # where routed_expert kernel may mutate hidden_states. - hidden_states_clone = shared_experts_input.clone() - - # Record that the clone will be used by shared_experts_stream - # to avoid gc issue from deallocation of hidden_states_clone - # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 + # Record that the shared_experts_input will be used in the + # shared_experts_stream to avoid gc issue from + # deallocation. For more details: + # https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 # NOTE: We don't need shared_output.record_stream(current_stream()) # because we synch the streams before using shared_output. 
- hidden_states_clone.record_stream(self.shared_experts_stream) + shared_experts_input.record_stream(self.shared_experts_stream) # Mark sync start point for the separate shared experts # stream here since we want to run in parallel with the @@ -265,7 +263,7 @@ class DefaultMoERunner(MoERunner): assert self.shared_experts_stream is not None self.shared_experts_stream.wait_stream(current_stream()) - return use_shared_experts_stream, hidden_states_clone + return use_shared_experts_stream, shared_experts_input def ensure_dp_chunking_init(self): if not self.use_dp_chunking or self.batched_hidden_states is not None: @@ -584,7 +582,7 @@ class DefaultMoERunner(MoERunner): use_chunked_impl = self.use_dp_chunking - use_shared_experts_stream, hidden_states_clone = ( + use_shared_experts_stream, shared_experts_input = ( self._maybe_setup_shared_experts_stream( hidden_states, shared_input, @@ -726,7 +724,7 @@ class DefaultMoERunner(MoERunner): with torch.cuda.stream(self.shared_experts_stream): # Note that hidden_states clone() is necessary here to avoid # conflict with the main stream - shared_output = self.shared_experts(hidden_states_clone) + shared_output = self.shared_experts(shared_experts_input) current_stream().wait_stream(self.shared_experts_stream) final_hidden_states = ( diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 4217d119a..4492b5763 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -175,7 +175,7 @@ class MiniCPMMoE(nn.Module): ) final_hidden_states = fused_experts( - hidden_states, self.ws, self.w2s, topk_weights, topk_ids, inplace=True + hidden_states, self.ws, self.w2s, topk_weights, topk_ids, inplace=False ) if self.tp_size > 1: