From 28c94770adfcb9cfbc78a3221f915bfc830c6582 Mon Sep 17 00:00:00 2001 From: roikoren755 <26850796+roikoren755@users.noreply.github.com> Date: Tue, 6 Jan 2026 18:00:40 +0200 Subject: [PATCH] [NemotronH] Use ReplicatedLinear for fc1_latent_proj (#31807) Signed-off-by: Roi Koren --- vllm/model_executor/models/nemotron_h.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 8bc9ce615..c0fe39427 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -210,16 +210,12 @@ class NemotronHMoE(nn.Module): ) if self.use_latent_moe: - # TODO: check if using ReplicatedLinear is better than - # ColumnParallelLinear + all_gather - self.fc1_latent_proj = ColumnParallelLinear( + self.fc1_latent_proj = ReplicatedLinear( input_size=config.hidden_size, output_size=self.moe_hidden_size, bias=config.mlp_bias, quant_config=quant_config, disable_tp=self.is_sequence_parallel, - # We need to gather the output to prepare input for moe - gather_output=True, prefix=f"{prefix}.fc1_latent_proj", ) self.fc2_latent_proj = ReplicatedLinear(