[Bugfix] Fix FusedMoEModularKernel for triton backend (#28913)

Signed-off-by: Xin Yang <xyangx@amazon.com>

[Bugfix] Fix FusedMoEModularKernel for triton backend (#28913)
Signed-off-by: Xin Yang <xyangx@amazon.com>
468a8d72 · Xin Yang · GitHub · 4c23690f · 468a8d72
Unverified commit 468a8d72, authored Nov 18, 2025 by Xin Yang; committed by GitHub on Nov 19, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 6 additions and 4 deletions

vllm/model_executor/layers/quantization/mxfp4.py +6 -4

No files found.
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -755,8 +755,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):

            self.w13_weight = w13_weight
            self.w2_weight = w2_weight
-            layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False)
-            layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False)
+            del layer.w13_weight
+            del layer.w2_weight
+            layer.w13_weight = w13_weight
+            layer.w2_weight = w2_weight
        else:
            raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")

@@ -1065,8 +1067,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):

            return triton_kernel_moe_forward(
                hidden_states=x,
-                w1=self.w13_weight,
-                w2=self.w2_weight,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
                gating_output=router_logits,
                topk=top_k,
                renormalize=renormalize,