Commit a13dd086 authored by 王敏
Browse files

[feat] w8a8 高吞吐模式先做量化再做 dispatch

parent 62f05dde
......@@ -527,6 +527,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
False)
return TritonOrGroupGemmExperts(
use_int8_w8a8=envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM,
use_int8_w8a8=True,
per_act_token_quant=True,
fused_experts=self.w8a8_groupgemm_contiguous_forward if envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM else self.fused_moe_forward
)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment