Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ce41e45b
Commit
ce41e45b
authored
Dec 23, 2025
by
王敏
Browse files
[feat]低延迟模式采用int8 dispatch
parent
fd894e48
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
5 additions
and
2 deletions
+5
-2
vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
...l_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+4
-1
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+1
-1
No files found.
vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
View file @
ce41e45b
...
@@ -185,7 +185,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -185,7 +185,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
a1_dtype
:
torch
.
dtype
,
a1_dtype
:
torch
.
dtype
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
expert_x
,
expert_x_scale
=
self
.
_do_quant
(
expert_x
,
a1_scale
,
a1_dtype
,
quant_config
,
expert_num_tokens
)
if
not
self
.
use_int8_dispatch
:
expert_x
,
expert_x_scale
=
self
.
_do_quant
(
expert_x
,
a1_scale
,
a1_dtype
,
quant_config
,
expert_num_tokens
)
else
:
expert_x
,
expert_x_scale
=
expert_x
expert_tokens_meta
=
mk
.
ExpertTokensMetadata
(
expert_tokens_meta
=
mk
.
ExpertTokensMetadata
(
expert_num_tokens
=
expert_num_tokens
,
expert_num_tokens_cpu
=
None
expert_num_tokens
=
expert_num_tokens
,
expert_num_tokens_cpu
=
None
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
ce41e45b
...
@@ -171,7 +171,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
...
@@ -171,7 +171,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
and
moe
.
quant_config
.
block_shape
and
moe
.
quant_config
.
block_shape
==
DEEPEP_QUANT_BLOCK_SHAPE
)
==
DEEPEP_QUANT_BLOCK_SHAPE
)
use_int8_dispatch
=
False
#
moe.quant_config.quant_dtype == torch.int8
use_int8_dispatch
=
moe
.
quant_config
.
quant_dtype
==
torch
.
int8
# Note (varun): Whether to use FP8 dispatch or not needs some
# Note (varun): Whether to use FP8 dispatch or not needs some
# profiling. Turning it off for now.
# profiling. Turning it off for now.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment