From 28c94770adfcb9cfbc78a3221f915bfc830c6582 Mon Sep 17 00:00:00 2001
From: roikoren755 <26850796+roikoren755@users.noreply.github.com>
Date: Tue, 6 Jan 2026 18:00:40 +0200
Subject: [PATCH] [NemotronH] Use ReplicatedLinear for fc1_latent_proj (#31807)

Signed-off-by: Roi Koren <roik@nvidia.com>
---
 vllm/model_executor/models/nemotron_h.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 8bc9ce615..c0fe39427 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -210,16 +210,12 @@ class NemotronHMoE(nn.Module):
         )
 
         if self.use_latent_moe:
-            # TODO: check if using ReplicatedLinear is better than
-            # ColumnParallelLinear + all_gather
-            self.fc1_latent_proj = ColumnParallelLinear(
+            self.fc1_latent_proj = ReplicatedLinear(
                 input_size=config.hidden_size,
                 output_size=self.moe_hidden_size,
                 bias=config.mlp_bias,
                 quant_config=quant_config,
                 disable_tp=self.is_sequence_parallel,
-                # We need to gather the output to prepare input for moe
-                gather_output=True,
                 prefix=f"{prefix}.fc1_latent_proj",
             )
             self.fc2_latent_proj = ReplicatedLinear(