Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2eb579dd
Commit
2eb579dd
authored
Oct 12, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev-ds' of
http://10.16.6.30/dcutoolkit/deeplearing/vllm
into v0.9.2-dev-ds
parents
7b8a9e18
0e92caa0
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
47 additions
and
41 deletions
+47
-41
vllm/attention/layer.py
vllm/attention/layer.py
+2
-2
vllm/config.py
vllm/config.py
+1
-1
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+1
-0
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
+35
-29
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+8
-9
No files found.
vllm/attention/layer.py
View file @
2eb579dd
...
...
@@ -81,7 +81,7 @@ class Attention(nn.Module):
calculate_kv_scales
=
cache_config
.
calculate_kv_scales
else
:
kv_cache_dtype
=
"auto"
block_size
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
or
envs
.
VLLM_USE_FLASH_MLA
else
16
block_size
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
and
envs
.
VLLM_USE_FLASH_MLA
else
16
is_attention_free
=
False
calculate_kv_scales
=
False
if
num_kv_heads
is
None
:
...
...
@@ -312,7 +312,7 @@ class MultiHeadAttention(nn.Module):
attn_backend
=
get_attn_backend
(
head_size
,
dtype
,
kv_cache_dtype
=
None
,
block_size
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
or
envs
.
VLLM_USE_FLASH_MLA
else
16
,
block_size
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
and
envs
.
VLLM_USE_FLASH_MLA
else
16
,
is_attention_free
=
False
)
backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
if
current_platform
.
is_rocm
():
...
...
vllm/config.py
View file @
2eb579dd
...
...
@@ -1499,7 +1499,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
class
CacheConfig
:
"""Configuration for the KV cache."""
block_size
:
BlockSize
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
or
envs
.
VLLM_USE_FLASH_MLA
else
16
# type: ignore
block_size
:
BlockSize
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
and
envs
.
VLLM_USE_FLASH_MLA
else
16
# type: ignore
"""Size of a contiguous cache block in number of tokens. This is ignored on
neuron devices and set to `--max-model-len`. On CUDA devices, only block
sizes up to 32 are supported. On HPU devices, block size defaults to 128.
...
...
vllm/distributed/parallel_state.py
View file @
2eb579dd
...
...
@@ -949,6 +949,7 @@ def init_distributed_environment(
backend
=
"gloo"
# this backend is used for WORLD
parallel_config
=
config
.
parallel_config
data_parallel_size
=
parallel_config
.
data_parallel_size
use_mori_ep
=
envs
.
VLLM_USE_MORI_EP
and
data_parallel_size
>
1
and
parallel_config
.
enable_expert_parallel
if
use_mori_ep
:
...
...
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
View file @
2eb579dd
...
...
@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua
from
vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher
import
MoEAlltoAllTokenDispatcher
from
vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis
import
EpMoeConfig
from
vllm.utils
import
direct_register_custom_op
import
mori
import
torch.distributed
as
dist
from
lmslim.layers.gemm.int8_utils
import
(
try
:
import
mori
from
lmslim.layers.gemm.int8_utils
import
(
per_token_quant_int8
)
except
ImportError
:
is_mori_available
=
False
logger
=
init_logger
(
__name__
)
...
...
@@ -239,6 +242,8 @@ class EPMoE(FusedMoE):
self
.
scales
=
None
self
.
use_int8_dispatch
=
True
vllm_config
=
get_current_vllm_config
()
self
.
max_num_inp_token_per_rank
=
vllm_config
.
scheduler_config
.
max_num_seqs
self
.
mori_op
=
self
.
get_mori_op
()
self
.
first
=
True
...
...
@@ -270,7 +275,7 @@ class EPMoE(FusedMoE):
hidden_dim
=
self
.
hidden_size
,
scale_dim
=
1
if
self
.
use_int8_dispatch
else
0
,
scale_type_size
=
mori_scale_type_size
,
max_num_inp_token_per_rank
=
512
,
max_num_inp_token_per_rank
=
self
.
max_num_inp_token_per_rank
,
num_experts_per_rank
=
self
.
local_num_experts
,
num_experts_per_token
=
self
.
top_k
,
max_token_type_size
=
2
,
...
...
@@ -381,16 +386,33 @@ class EPMoE(FusedMoE):
)
#self.sync()
expect_m
=
hidden_states
.
shape
[
0
]
*
self
.
ep_size
dispatch_output_clip
=
dispatch_output
[:
expect_m
]
dispatch_weights_clip
=
dispatch_weights
[:
expect_m
]
dispatch_indices_clip
=
dispatch_indices
[:
expect_m
]
dispatch_scales_clip
=
dispatch_scales
[:
expect_m
]
# expect_m = topk_ids.shape[0] * self.ep_size
# dispatch_output_clip = dispatch_output[:expect_m]
# dispatch_weights_clip = dispatch_weights[:expect_m]
# dispatch_indices_clip = dispatch_indices[:expect_m]
# dispatch_scales_clip = dispatch_scales[:expect_m]
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output_clip,
# topk_weights=dispatch_weights_clip,
# topk_ids=dispatch_indices_clip,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0],
# scales=dispatch_scales_clip if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
expert_output
=
self
.
quant_method
.
apply_ep
(
layer
=
self
,
x
=
dispatch_output
_clip
,
topk_weights
=
dispatch_weights
_clip
,
topk_ids
=
dispatch_indices
_clip
,
x
=
dispatch_output
,
topk_weights
=
dispatch_weights
,
topk_ids
=
dispatch_indices
,
global_num_experts
=
self
.
global_num_experts
,
expert_map
=
self
.
expert_map
,
activation
=
self
.
activation
,
...
...
@@ -398,25 +420,9 @@ class EPMoE(FusedMoE):
use_nn_moe
=
self
.
use_nn_moe
,
num_local_tokens
=
dispatch_recv_num_token
,
config_select_bs
=
hidden_states
.
shape
[
0
],
scales
=
dispatch_scales
_clip
if
self
.
use_int8_dispatch
else
None
scales
=
dispatch_scales
if
self
.
use_int8_dispatch
else
None
#routed_scaling_factor=self.routed_scaling_factor,
)
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output,
# topk_weights=dispatch_weights,
# topk_ids=dispatch_indices,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0]*2,
# scales=dispatch_scales if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
#self.sync()
combine_output
,
_
=
self
.
mori_op
.
combine
(
expert_output
,
dispatch_weights
,
topk_ids
)
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
2eb579dd
...
...
@@ -242,11 +242,6 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
else
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
not
self
.
use_mori_ep
:
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
or
self
.
dpsk_fp16_quick
:
final_hidden_states
=
final_hidden_states
+
shared_output
...
...
@@ -255,7 +250,11 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
else
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
not
self
.
use_mori_ep
:
if
self
.
tp_size
>
1
:
if
envs
.
VLLM_ENABLE_TBO
:
final_hidden_states
=
self
.
tbo_all_reduce
(
final_hidden_states
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment