Unverified Commit 2104d20e authored by Atream, committed by GitHub

Temporarily fix missing routed_scaling_factor for CompressedTensorsWNA16MoEMethod (#12738)

parent f235498e
@@ -84,6 +84,7 @@ from sglang.srt.layers.quantization import CompressedTensorsConfig
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
     CompressedTensorsWNA16AMXEPMoEMethod,
+    CompressedTensorsWNA16MoEMethod,
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.fp8_kernel import (
@@ -777,8 +778,14 @@ class DeepseekV2MoE(nn.Module):
         router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
         topk_output = self.topk(hidden_states, router_logits)
         final_hidden_states = self.experts(hidden_states, topk_output)
-        if not _is_cuda or isinstance(
-            self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
+        if (
+            not _is_cuda
+            or isinstance(
+                self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
+            )
+            or isinstance(
+                self.experts.quant_method, CompressedTensorsWNA16MoEMethod
+            )
         ):
             final_hidden_states *= self.routed_scaling_factor
@@ -838,7 +845,14 @@ class DeepseekV2MoE(nn.Module):
                 else {}
             ),
         )
-        if not _is_cuda and not _use_aiter:
+        if (
+            not _is_cuda
+            and not _use_aiter
+            or isinstance(
+                self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
+            )
+            or isinstance(self.experts.quant_method, CompressedTensorsWNA16MoEMethod)
+        ):
             # fused in biased_grouped_topk so we can skip here
             final_hidden_states *= self.routed_scaling_factor
         if shared_output is not None:
...
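Note: both hunks implement the same temporary fix, multiplying the expert output by routed_scaling_factor on the Python side whenever the selected MoE quant method does not already apply it in its fused path (per the commit title, the factor was missing for CompressedTensorsWNA16MoEMethod). Below is a minimal sketch of the added check written as a predicate, assuming the surrounding sglang names (_is_cuda, _use_aiter, and the CompressedTensors MoE method classes). The helper _needs_routed_scaling is hypothetical and not part of this patch; it follows the second hunk's condition, while the first hunk checks only not _is_cuda without the aiter exclusion.

# Hypothetical sketch, not part of the commit: the condition added in the
# second hunk expressed as a predicate. Python's operator precedence makes it
# (not _is_cuda and not _use_aiter) or isinstance(...) or isinstance(...).
from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
    CompressedTensorsWNA16AMXEPMoEMethod,
    CompressedTensorsWNA16MoEMethod,
)


def _needs_routed_scaling(quant_method, is_cuda: bool, use_aiter: bool) -> bool:
    # Non-CUDA, non-aiter backends do not fuse the scaling into the kernel,
    # so the caller must apply it to the expert output.
    if not is_cuda and not use_aiter:
        return True
    # Per this commit, the scaling is also not applied for the CompressedTensors
    # WNA16 MoE methods, so it has to be done here as well.
    return isinstance(
        quant_method,
        (CompressedTensorsWNA16AMXEPMoEMethod, CompressedTensorsWNA16MoEMethod),
    )


# Usage mirroring the patched forward path:
# if _needs_routed_scaling(self.experts.quant_method, _is_cuda, _use_aiter):
#     final_hidden_states *= self.routed_scaling_factor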