Unverified commit 14d80648 authored by Atream, committed by GitHub

fix: Fix KTransformers hybrid inference with int8 quantization and format (#12536)

parent ab8b83f7
@@ -751,6 +751,7 @@ class CompressedTensorsWNA16AMXMoEMethod(CompressedTensorsMoEMethod):
             threadpool_count=self.threadpool_count,
             amx_weight_path=self.amx_weight_path,
             chunked_prefill_size=self.chunked_prefill_size,
+            amx_method=envs.SGLANG_KT_AMX_METHOD.value,
         )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
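For context on the added keyword argument: `envs.SGLANG_KT_AMX_METHOD` reads as an environment-variable wrapper exposing a `.value` property, which this hunk threads through as `amx_method` so the AMX kernel selection (the commit title mentions int8 quantization) can be controlled at launch time. Below is a minimal sketch of such an accessor, purely an assumption about its shape; the `EnvStr` class and the default value are hypothetical, only the variable name `SGLANG_KT_AMX_METHOD` comes from the diff.

```python
import os

class EnvStr:
    """Hypothetical sketch of a string-typed env accessor with a .value
    property, assuming sglang's envs module exposes values this way."""

    def __init__(self, name: str, default: str):
        self.name = name
        self.default = default

    @property
    def value(self) -> str:
        # Fall back to the default when the variable is unset.
        return os.environ.get(self.name, self.default)

# Hypothetical default; the real default lives in sglang's envs module.
SGLANG_KT_AMX_METHOD = EnvStr("SGLANG_KT_AMX_METHOD", "AMXINT8")
```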
@@ -751,12 +751,10 @@ class DeepseekV2MoE(nn.Module):
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
         topk_output = self.topk(hidden_states, router_logits)
-        if isinstance(
-            self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
-        ):
-            topk_output.topk_weights.mul_(self.routed_scaling_factor)
-        final_hidden_states = self.experts(hidden_states, topk_output)
-        if not _is_cuda:
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if not _is_cuda or isinstance(
+            self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
+        ):
             final_hidden_states *= self.routed_scaling_factor
         current_stream.wait_stream(self.alt_stream)
 
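The rationale, as I read the diff (not stated in the commit message): the MoE combine is linear in the top-k weights, so multiplying `topk_weights` by `routed_scaling_factor` before the experts is equivalent to multiplying the combined output afterward. The old code did both on the CPU AMX path (weights scaled pre-experts, hidden states scaled again post-experts when `not _is_cuda`), effectively applying the factor twice; the new code scales exactly once, after the experts. A toy sketch of that equivalence, where the `combine` helper is a hypothetical stand-in for what `self.experts(...)` does internally:

```python
import torch

def combine(expert_outs: torch.Tensor, topk_weights: torch.Tensor) -> torch.Tensor:
    """Toy MoE combine: weighted sum of per-expert outputs.
    expert_outs: (num_tokens, top_k, hidden); topk_weights: (num_tokens, top_k)."""
    return (expert_outs * topk_weights.unsqueeze(-1)).sum(dim=1)

torch.manual_seed(0)
outs = torch.randn(4, 2, 8)        # 4 tokens, top-2 experts, hidden=8
weights = torch.rand(4, 2)
s = 2.5                            # stand-in routed_scaling_factor

pre = combine(outs, weights * s)   # old path: scale the top-k weights first
post = combine(outs, weights) * s  # new path: scale the combined output once
assert torch.allclose(pre, post)   # linearity makes the two equivalent

# The old AMX-EP-on-CPU path did both scalings, multiplying by s twice;
# the consolidated condition in the new code applies s exactly once.
```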