Commit c03a553b authored by 王敏
Browse files

[feat]w8a8 高吞吐模式先量化再dispatch

parent 4fadef92
...@@ -520,6 +520,8 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod) ...@@ -520,6 +520,8 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
False) False)
return TritonOrGroupGemmExperts( return TritonOrGroupGemmExperts(
use_int8_w8a8=envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM, #use_int8_w8a8=envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM,
use_int8_w8a8=True,
per_act_token_quant=True,
fused_experts=self.w8a8_groupgemm_contiguous_forward if envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM else self.fused_moe_forward fused_experts=self.w8a8_groupgemm_contiguous_forward if envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM else self.fused_moe_forward
) )
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment