Unverified commit 14d80648 authored by Atream, committed by GitHub

fix: Fix KTransformers hybrid inference with int8 quantization and format (#12536)

parent ab8b83f7
@@ -751,6 +751,7 @@ class CompressedTensorsWNA16AMXMoEMethod(CompressedTensorsMoEMethod):
             threadpool_count=self.threadpool_count,
             amx_weight_path=self.amx_weight_path,
             chunked_prefill_size=self.chunked_prefill_size,
+            amx_method=envs.SGLANG_KT_AMX_METHOD.value,
         )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
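For context on the added keyword argument: `envs.SGLANG_KT_AMX_METHOD` reads as an environment-variable wrapper exposing a `.value` property, which this hunk threads through as `amx_method` so the AMX kernel selection (the commit title mentions int8 quantization) can be controlled at launch time. Below is a minimal sketch of such an accessor, purely an assumption about its shape; the `EnvStr` class and the default value are hypothetical, only the variable name `SGLANG_KT_AMX_METHOD` comes from the diff.

```python
import os

class EnvStr:
    """Hypothetical sketch of a string-typed env accessor with a .value
    property, assuming sglang's envs module exposes values this way."""

    def __init__(self, name: str, default: str):
        self.name = name
        self.default = default

    @property
    def value(self) -> str:
        # Fall back to the default when the variable is unset.
        return os.environ.get(self.name, self.default)

# Hypothetical default; the real default lives in sglang's envs module.
SGLANG_KT_AMX_METHOD = EnvStr("SGLANG_KT_AMX_METHOD", "AMXINT8")
```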
@@ -751,12 +751,10 @@ class DeepseekV2MoE(nn.Module):
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
         topk_output = self.topk(hidden_states, router_logits)
-        if isinstance(
-            self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
-        ):
-            topk_output.topk_weights.mul_(self.routed_scaling_factor)
-        final_hidden_states = self.experts(hidden_states, topk_output)
-        if not _is_cuda:
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if not _is_cuda or isinstance(
+            self.experts.quant_method, CompressedTensorsWNA16AMXEPMoEMethod
+        ):
             final_hidden_states *= self.routed_scaling_factor
         current_stream.wait_stream(self.alt_stream)
 
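The rationale, as I read the diff (not stated in the commit message): the MoE combine is linear in the top-k weights, so multiplying `topk_weights` by `routed_scaling_factor` before the experts is equivalent to multiplying the combined output afterward. The old code did both on the CPU AMX path (weights scaled pre-experts, hidden states scaled again post-experts when `not _is_cuda`), effectively applying the factor twice; the new code scales exactly once, after the experts. A toy sketch of that equivalence, where the `combine` helper is a hypothetical stand-in for what `self.experts(...)` does internally:

```python
import torch

def combine(expert_outs: torch.Tensor, topk_weights: torch.Tensor) -> torch.Tensor:
    """Toy MoE combine: weighted sum of per-expert outputs.
    expert_outs: (num_tokens, top_k, hidden); topk_weights: (num_tokens, top_k)."""
    return (expert_outs * topk_weights.unsqueeze(-1)).sum(dim=1)

torch.manual_seed(0)
outs = torch.randn(4, 2, 8)        # 4 tokens, top-2 experts, hidden=8
weights = torch.rand(4, 2)
s = 2.5                            # stand-in routed_scaling_factor

pre = combine(outs, weights * s)   # old path: scale the top-k weights first
post = combine(outs, weights) * s  # new path: scale the combined output once
assert torch.allclose(pre, post)   # linearity makes the two equivalent

# The old AMX-EP-on-CPU path did both scalings, multiplying by s twice;
# the consolidated condition in the new code applies s exactly once.
```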