Commit 428f3245 authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds-wm-1217' into 'v0.9.2-dev-ds'

w8a8 高吞吐模式先量化再dispatch

See merge request dcutoolkit/deeplearing/vllm!303
parents d5b6456a 46be3c09
......@@ -520,6 +520,8 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
False)
return TritonOrGroupGemmExperts(
use_int8_w8a8=envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM,
#use_int8_w8a8=envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM,
use_int8_w8a8=True,
per_act_token_quant=True,
fused_experts=self.w8a8_groupgemm_contiguous_forward if envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM else self.fused_moe_forward
)
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment