Temporarily fix missing routed_scaling_factor for CompressedTensorsWNA16MoEMethod (#12738)

2104d20e · Atream · GitHub · f235498e · 2104d20e
Unverified commit 2104d20e, authored Nov 06, 2025 by Atream; committed by GitHub on Nov 06, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 17 additions and 3 deletions

python/sglang/srt/models/deepseek_v2.py +17 -3

No files found.
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -84,6 +84,7 @@ from sglang.srt.layers.quantization import CompressedTensorsConfig
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
    CompressedTensorsWNA16AMXEPMoEMethod,
+    CompressedTensorsWNA16MoEMethod,
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.fp8_kernel import (
@@ -777,8 +778,14 @@ class DeepseekV2MoE(nn.Module):
            router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
            topk_output = self.topk(hidden_states, router_logits)
            final_hidden_states = self.experts(hidden_states, topk_output)
-            if not _is_cuda or isinstance(
-                self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
+            if (
+                not _is_cuda
+                or isinstance(
+                    self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
+                )
+                or isinstance(
+                    self.experts.quant_method, CompressedTensorsWNA16MoEMethod
+                )
            ):
                final_hidden_states *= self.routed_scaling_factor

@@ -838,7 +845,14 @@ class DeepseekV2MoE(nn.Module):
                else {}
            ),
        )
-        if not _is_cuda and not _use_aiter:
+        if (
+            not _is_cuda
+            and not _use_aiter
+            or isinstance(
+                self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
+            )
+            or isinstance(self.experts.quant_method, CompressedTensorsWNA16MoEMethod)
+        ):
            # fused in biased_grouped_topk so we can skip here
            final_hidden_states *= self.routed_scaling_factor
        if shared_output is not None: