Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3b30e615
Unverified
Commit
3b30e615
authored
Feb 16, 2026
by
roikoren755
Committed by
GitHub
Feb 16, 2026
Browse files
[NemotronH] Do not force router to run in fp32 (#34582)
Signed-off-by: Roi Koren <roik@nvidia.com>
parent
824f9e8f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing 2 changed files with 5 additions and 4 deletions
+5
-4
vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
.../model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+4
-0
vllm/model_executor/models/nemotron_h.py
vllm/model_executor/models/nemotron_h.py
+1
-4
No files found.
vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
View file @
3b30e615
...
@@ -309,6 +309,10 @@ def fi_trtllm_fp8_per_tensor_moe(
...
@@ -309,6 +309,10 @@ def fi_trtllm_fp8_per_tensor_moe(
from
vllm.utils.flashinfer
import
flashinfer_trtllm_fp8_per_tensor_scale_moe
from
vllm.utils.flashinfer
import
flashinfer_trtllm_fp8_per_tensor_scale_moe
# The DeepSeekV3 routing method requires float32 router logits.
if
routing_method_type
==
RoutingMethodType
.
DeepSeekV3
:
routing_logits
=
routing_logits
.
to
(
torch
.
float32
)
return
flashinfer_trtllm_fp8_per_tensor_scale_moe
(
return
flashinfer_trtllm_fp8_per_tensor_scale_moe
(
routing_logits
=
routing_logits
,
routing_logits
=
routing_logits
,
routing_bias
=
routing_bias
,
routing_bias
=
routing_bias
,
...
...
vllm/model_executor/models/nemotron_h.py
View file @
3b30e615
...
@@ -148,12 +148,10 @@ class NemotronHMoE(nn.Module):
...
@@ -148,12 +148,10 @@ class NemotronHMoE(nn.Module):
self
.
is_sequence_parallel
=
parallel_config
.
use_sequence_parallel_moe
self
.
is_sequence_parallel
=
parallel_config
.
use_sequence_parallel_moe
router_logits_dtype
=
torch
.
float32
self
.
gate
=
ReplicatedLinear
(
self
.
gate
=
ReplicatedLinear
(
config
.
hidden_size
,
config
.
hidden_size
,
config
.
n_routed_experts
,
config
.
n_routed_experts
,
bias
=
False
,
bias
=
False
,
params_dtype
=
router_logits_dtype
,
quant_config
=
None
,
quant_config
=
None
,
prefix
=
f
"
{
prefix
}
.gate"
,
prefix
=
f
"
{
prefix
}
.gate"
,
)
)
...
@@ -232,7 +230,6 @@ class NemotronHMoE(nn.Module):
...
@@ -232,7 +230,6 @@ class NemotronHMoE(nn.Module):
enable_eplb
=
self
.
enable_eplb
,
enable_eplb
=
self
.
enable_eplb
,
num_redundant_experts
=
self
.
n_redundant_experts
,
num_redundant_experts
=
self
.
n_redundant_experts
,
is_sequence_parallel
=
self
.
is_sequence_parallel
,
is_sequence_parallel
=
self
.
is_sequence_parallel
,
router_logits_dtype
=
router_logits_dtype
,
routed_input_transform
=
self
.
fc1_latent_proj
,
routed_input_transform
=
self
.
fc1_latent_proj
,
)
)
...
@@ -244,7 +241,7 @@ class NemotronHMoE(nn.Module):
...
@@ -244,7 +241,7 @@ class NemotronHMoE(nn.Module):
hidden_states
=
sequence_parallel_chunk
(
hidden_states
)
hidden_states
=
sequence_parallel_chunk
(
hidden_states
)
# router_logits: (num_tokens, n_experts)
# router_logits: (num_tokens, n_experts)
router_logits
,
_
=
self
.
gate
(
hidden_states
.
to
(
dtype
=
torch
.
float32
)
)
router_logits
,
_
=
self
.
gate
(
hidden_states
)
# SharedFusedMoE handles:
# SharedFusedMoE handles:
# - shared experts (with original hidden_states)
# - shared experts (with original hidden_states)
...
...
Write
Preview
Markdown is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment