Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhaoyu6
sglang
Commits
2104d20e
"vscode:/vscode.git/clone" did not exist on "cd18cb3bb50ea07332ad034ada7a799d4f697772"
Unverified
Commit
2104d20e
authored
Nov 06, 2025
by
Atream
Committed by
GitHub
Nov 06, 2025
Browse files
Temporarily fix missing routed_scaling_factor for CompressedTensorsWNA16MoEMethod (#12738)
parent
f235498e
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
17 additions
and
3 deletions
+17
-3
python/sglang/srt/models/deepseek_v2.py
python/sglang/srt/models/deepseek_v2.py
+17
-3
No files found.
python/sglang/srt/models/deepseek_v2.py
View file @
2104d20e
...
...
@@ -84,6 +84,7 @@ (imports section — one name added)

    from sglang.srt.layers.quantization.base_config import QuantizationConfig
    from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
        CompressedTensorsWNA16AMXEPMoEMethod,
+       CompressedTensorsWNA16MoEMethod,
    )
    from sglang.srt.layers.quantization.fp8 import Fp8Config
    from sglang.srt.layers.quantization.fp8_kernel import (
...
...
@@ -777,8 +778,14 @@ class DeepseekV2MoE(nn.Module):

Context:

    router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
    topk_output = self.topk(hidden_states, router_logits)
    final_hidden_states = self.experts(hidden_states, topk_output)

Before:

    if not _is_cuda or isinstance(
        self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
    ):
        final_hidden_states *= self.routed_scaling_factor

After:

    if (
        not _is_cuda
        or isinstance(
            self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
        )
        or isinstance(
            self.experts.quant_method, CompressedTensorsWNA16MoEMethod
        )
    ):
        final_hidden_states *= self.routed_scaling_factor
...
...
@@ -838,7 +845,14 @@ class DeepseekV2MoE(nn.Module):

Context:

            else {}
        ),
    )

Before:

    if not _is_cuda and not _use_aiter:

After:

    if (
        not _is_cuda
        and not _use_aiter
        or isinstance(
            self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
        )
        or isinstance(
            self.experts.quant_method, CompressedTensorsWNA16MoEMethod
        )
    ):

Context (unchanged tail):

        # fused in biased_grouped_topk so we can skip here
        final_hidden_states *= self.routed_scaling_factor
    if shared_output is not None:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment