[ROCm] Cast score correction bias tensor during model construction for DeepSeek/Kimi-K2 (#39999)

Signed-off-by: Hemanth Acharya <heachary@amd.com>

[ROCm] Cast score correction bias tensor during model construction for DeepSeek/Kimi-K2 (#39999)
Signed-off-by: Hemanth Acharya <heachary@amd.com>
fa4b7055 · Hemanth Acharya · GitHub · 447c372a · fa4b7055 · fa4b7055
Unverified commit fa4b7055, authored Apr 24, 2026 by Hemanth Acharya; committed by GitHub on Apr 24, 2026.
4 changed files
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@@ -1782,6 +1782,8 @@ class rocm_aiter_ops:
        need_renorm: bool,
        routed_scaling_factor: float = 1.0,
    ) -> None:
+        if correction_bias.dtype != gating_output.dtype:
+            correction_bias = correction_bias.to(gating_output.dtype)
        torch.ops.vllm.rocm_aiter_biased_grouped_topk(
            gating_output,
            correction_bias,

--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -152,7 +152,7 @@ def rocm_aiter_grouped_topk(
    if e_score_correction_bias is not None:
        rocm_aiter_ops.biased_grouped_topk(
            gating_output,
-            e_score_correction_bias.to(gating_output.dtype),
+            e_score_correction_bias,
            topk_weights,
            topk_ids,
            num_expert_group,

--- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
+++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
@@ -136,7 +136,7 @@ def fused_topk_bias(
            )
            rocm_aiter_ops.biased_grouped_topk(
                gating_output,
-                e_score_correction_bias.to(gating_output.dtype),
+                e_score_correction_bias,
                topk_weights,
                topk_ids,
                num_expert_group=num_expert_group,

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -349,6 +349,21 @@ class DeepseekV2MoE(nn.Module):
            else torch.bfloat16
        )
+        # Pre-cast the bias to match the gate output dtype so the
+        # conversion is not repeated on every forward pass.  All
+        # downstream references (FusedMoE, router) share the same
+        # nn.Parameter object, so mutating .data propagates everywhere.
+        # Weight loading uses copy_(), which handles the dtype conversion.
+        # Only needed on ROCm where the aiter biased_grouped_topk kernel
+        # requires the bias dtype to match the gating output dtype.
+        if (
+            self.is_rocm_aiter_moe_enabled
+            and self.gate.e_score_correction_bias is not None
+        ):
+            self.gate.e_score_correction_bias.data = (
+                self.gate.e_score_correction_bias.data.to(self.gate.out_dtype)
+            )
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)