[NemotronH] Use ReplicatedLinear for fc1_latent_proj (#31807)

Signed-off-by: Roi Koren <roik@nvidia.com>

[NemotronH] Use ReplicatedLinear for fc1_latent_proj (#31807)
Signed-off-by: Roi Koren <roik@nvidia.com>
28c94770 · roikoren755 · GitHub · af8fd730 · 28c94770
Unverified commit 28c94770, authored Jan 06, 2026 by roikoren755; committed by GitHub on Jan 06, 2026.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 1 addition and 5 deletions

vllm/model_executor/models/nemotron_h.py +1 -5

No files found.
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -210,16 +210,12 @@ class NemotronHMoE(nn.Module):
        )

        if self.use_latent_moe:
-            # TODO: check if using ReplicatedLinear is better than
-            # ColumnParallelLinear + all_gather
-            self.fc1_latent_proj = ColumnParallelLinear(
+            self.fc1_latent_proj = ReplicatedLinear(
                input_size=config.hidden_size,
                output_size=self.moe_hidden_size,
                bias=config.mlp_bias,
                quant_config=quant_config,
                disable_tp=self.is_sequence_parallel,
-                # We need to gather the output to prepare input for moe
-                gather_output=True,
                prefix=f"{prefix}.fc1_latent_proj",
            )
            self.fc2_latent_proj = ReplicatedLinear(