Commit b91ae72f authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds-wm-1223' into 'v0.9.2-dev-ds'

[feat]低延迟模式采用int8 dispatch

See merge request dcutoolkit/deeplearing/vllm!314
parents fd894e48 ce41e45b
...@@ -185,7 +185,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ...@@ -185,7 +185,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
a1_dtype: torch.dtype, a1_dtype: torch.dtype,
quant_config: FusedMoEQuantConfig, quant_config: FusedMoEQuantConfig,
) -> mk.PrepareResultType: ) -> mk.PrepareResultType:
if not self.use_int8_dispatch:
expert_x, expert_x_scale = self._do_quant(expert_x, a1_scale, a1_dtype, quant_config, expert_num_tokens) expert_x, expert_x_scale = self._do_quant(expert_x, a1_scale, a1_dtype, quant_config, expert_num_tokens)
else:
expert_x, expert_x_scale = expert_x
expert_tokens_meta = mk.ExpertTokensMetadata( expert_tokens_meta = mk.ExpertTokensMetadata(
expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None
......
...@@ -171,7 +171,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): ...@@ -171,7 +171,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
and moe.quant_config.block_shape and moe.quant_config.block_shape
== DEEPEP_QUANT_BLOCK_SHAPE) == DEEPEP_QUANT_BLOCK_SHAPE)
use_int8_dispatch = False#moe.quant_config.quant_dtype == torch.int8 use_int8_dispatch = moe.quant_config.quant_dtype == torch.int8
# Note (varun): Whether to use FP8 dispatch or not needs some # Note (varun): Whether to use FP8 dispatch or not needs some
# profiling. Turning it off for now. # profiling. Turning it off for now.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment