Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2eb579dd
Commit
2eb579dd
authored
Oct 12, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev-ds' of
http://10.16.6.30/dcutoolkit/deeplearing/vllm
into v0.9.2-dev-ds
parents
7b8a9e18
0e92caa0
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
47 additions
and
41 deletions
+47
-41
vllm/attention/layer.py
vllm/attention/layer.py
+2
-2
vllm/config.py
vllm/config.py
+1
-1
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+1
-0
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
+35
-29
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+8
-9
No files found.
vllm/attention/layer.py
View file @
2eb579dd
...
@@ -81,7 +81,7 @@ class Attention(nn.Module):
...
@@ -81,7 +81,7 @@ class Attention(nn.Module):
calculate_kv_scales
=
cache_config
.
calculate_kv_scales
calculate_kv_scales
=
cache_config
.
calculate_kv_scales
else
:
else
:
kv_cache_dtype
=
"auto"
kv_cache_dtype
=
"auto"
block_size
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
or
envs
.
VLLM_USE_FLASH_MLA
else
16
block_size
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
and
envs
.
VLLM_USE_FLASH_MLA
else
16
is_attention_free
=
False
is_attention_free
=
False
calculate_kv_scales
=
False
calculate_kv_scales
=
False
if
num_kv_heads
is
None
:
if
num_kv_heads
is
None
:
...
@@ -312,7 +312,7 @@ class MultiHeadAttention(nn.Module):
...
@@ -312,7 +312,7 @@ class MultiHeadAttention(nn.Module):
attn_backend
=
get_attn_backend
(
head_size
,
attn_backend
=
get_attn_backend
(
head_size
,
dtype
,
dtype
,
kv_cache_dtype
=
None
,
kv_cache_dtype
=
None
,
block_size
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
or
envs
.
VLLM_USE_FLASH_MLA
else
16
,
block_size
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
and
envs
.
VLLM_USE_FLASH_MLA
else
16
,
is_attention_free
=
False
)
is_attention_free
=
False
)
backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
backend
=
backend_name_to_enum
(
attn_backend
.
get_name
())
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
...
...
vllm/config.py
View file @
2eb579dd
...
@@ -1499,7 +1499,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
...
@@ -1499,7 +1499,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
class
CacheConfig
:
class
CacheConfig
:
"""Configuration for the KV cache."""
"""Configuration for the KV cache."""
block_size
:
BlockSize
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
or
envs
.
VLLM_USE_FLASH_MLA
else
16
# type: ignore
block_size
:
BlockSize
=
64
if
envs
.
VLLM_USE_FLASH_ATTN_PA
and
envs
.
VLLM_USE_FLASH_MLA
else
16
# type: ignore
"""Size of a contiguous cache block in number of tokens. This is ignored on
"""Size of a contiguous cache block in number of tokens. This is ignored on
neuron devices and set to `--max-model-len`. On CUDA devices, only block
neuron devices and set to `--max-model-len`. On CUDA devices, only block
sizes up to 32 are supported. On HPU devices, block size defaults to 128.
sizes up to 32 are supported. On HPU devices, block size defaults to 128.
...
...
vllm/distributed/parallel_state.py
View file @
2eb579dd
...
@@ -949,6 +949,7 @@ def init_distributed_environment(
...
@@ -949,6 +949,7 @@ def init_distributed_environment(
backend
=
"gloo"
backend
=
"gloo"
# this backend is used for WORLD
# this backend is used for WORLD
parallel_config
=
config
.
parallel_config
data_parallel_size
=
parallel_config
.
data_parallel_size
data_parallel_size
=
parallel_config
.
data_parallel_size
use_mori_ep
=
envs
.
VLLM_USE_MORI_EP
and
data_parallel_size
>
1
and
parallel_config
.
enable_expert_parallel
use_mori_ep
=
envs
.
VLLM_USE_MORI_EP
and
data_parallel_size
>
1
and
parallel_config
.
enable_expert_parallel
if
use_mori_ep
:
if
use_mori_ep
:
...
...
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
View file @
2eb579dd
...
@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua
...
@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua
from
vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher
import
MoEAlltoAllTokenDispatcher
from
vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher
import
MoEAlltoAllTokenDispatcher
from
vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis
import
EpMoeConfig
from
vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis
import
EpMoeConfig
from
vllm.utils
import
direct_register_custom_op
from
vllm.utils
import
direct_register_custom_op
import
mori
import
torch.distributed
as
dist
import
torch.distributed
as
dist
from
lmslim.layers.gemm.int8_utils
import
(
try
:
import
mori
from
lmslim.layers.gemm.int8_utils
import
(
per_token_quant_int8
)
per_token_quant_int8
)
except
ImportError
:
is_mori_available
=
False
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -239,6 +242,8 @@ class EPMoE(FusedMoE):
...
@@ -239,6 +242,8 @@ class EPMoE(FusedMoE):
self
.
scales
=
None
self
.
scales
=
None
self
.
use_int8_dispatch
=
True
self
.
use_int8_dispatch
=
True
vllm_config
=
get_current_vllm_config
()
self
.
max_num_inp_token_per_rank
=
vllm_config
.
scheduler_config
.
max_num_seqs
self
.
mori_op
=
self
.
get_mori_op
()
self
.
mori_op
=
self
.
get_mori_op
()
self
.
first
=
True
self
.
first
=
True
...
@@ -270,7 +275,7 @@ class EPMoE(FusedMoE):
...
@@ -270,7 +275,7 @@ class EPMoE(FusedMoE):
hidden_dim
=
self
.
hidden_size
,
hidden_dim
=
self
.
hidden_size
,
scale_dim
=
1
if
self
.
use_int8_dispatch
else
0
,
scale_dim
=
1
if
self
.
use_int8_dispatch
else
0
,
scale_type_size
=
mori_scale_type_size
,
scale_type_size
=
mori_scale_type_size
,
max_num_inp_token_per_rank
=
512
,
max_num_inp_token_per_rank
=
self
.
max_num_inp_token_per_rank
,
num_experts_per_rank
=
self
.
local_num_experts
,
num_experts_per_rank
=
self
.
local_num_experts
,
num_experts_per_token
=
self
.
top_k
,
num_experts_per_token
=
self
.
top_k
,
max_token_type_size
=
2
,
max_token_type_size
=
2
,
...
@@ -381,16 +386,33 @@ class EPMoE(FusedMoE):
...
@@ -381,16 +386,33 @@ class EPMoE(FusedMoE):
)
)
#self.sync()
#self.sync()
expect_m
=
hidden_states
.
shape
[
0
]
*
self
.
ep_size
# expect_m = topk_ids.shape[0] * self.ep_size
dispatch_output_clip
=
dispatch_output
[:
expect_m
]
# dispatch_output_clip = dispatch_output[:expect_m]
dispatch_weights_clip
=
dispatch_weights
[:
expect_m
]
# dispatch_weights_clip = dispatch_weights[:expect_m]
dispatch_indices_clip
=
dispatch_indices
[:
expect_m
]
# dispatch_indices_clip = dispatch_indices[:expect_m]
dispatch_scales_clip
=
dispatch_scales
[:
expect_m
]
# dispatch_scales_clip = dispatch_scales[:expect_m]
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output_clip,
# topk_weights=dispatch_weights_clip,
# topk_ids=dispatch_indices_clip,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0],
# scales=dispatch_scales_clip if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
expert_output
=
self
.
quant_method
.
apply_ep
(
expert_output
=
self
.
quant_method
.
apply_ep
(
layer
=
self
,
layer
=
self
,
x
=
dispatch_output
_clip
,
x
=
dispatch_output
,
topk_weights
=
dispatch_weights
_clip
,
topk_weights
=
dispatch_weights
,
topk_ids
=
dispatch_indices
_clip
,
topk_ids
=
dispatch_indices
,
global_num_experts
=
self
.
global_num_experts
,
global_num_experts
=
self
.
global_num_experts
,
expert_map
=
self
.
expert_map
,
expert_map
=
self
.
expert_map
,
activation
=
self
.
activation
,
activation
=
self
.
activation
,
...
@@ -398,25 +420,9 @@ class EPMoE(FusedMoE):
...
@@ -398,25 +420,9 @@ class EPMoE(FusedMoE):
use_nn_moe
=
self
.
use_nn_moe
,
use_nn_moe
=
self
.
use_nn_moe
,
num_local_tokens
=
dispatch_recv_num_token
,
num_local_tokens
=
dispatch_recv_num_token
,
config_select_bs
=
hidden_states
.
shape
[
0
],
config_select_bs
=
hidden_states
.
shape
[
0
],
scales
=
dispatch_scales
_clip
if
self
.
use_int8_dispatch
else
None
scales
=
dispatch_scales
if
self
.
use_int8_dispatch
else
None
#routed_scaling_factor=self.routed_scaling_factor,
#routed_scaling_factor=self.routed_scaling_factor,
)
)
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output,
# topk_weights=dispatch_weights,
# topk_ids=dispatch_indices,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0]*2,
# scales=dispatch_scales if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
#self.sync()
#self.sync()
combine_output
,
_
=
self
.
mori_op
.
combine
(
expert_output
,
dispatch_weights
,
topk_ids
)
combine_output
,
_
=
self
.
mori_op
.
combine
(
expert_output
,
dispatch_weights
,
topk_ids
)
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
2eb579dd
...
@@ -242,11 +242,6 @@ class DeepseekV2MoE(nn.Module):
...
@@ -242,11 +242,6 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details.
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
router_logits
=
router_logits
)
else
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
not
self
.
use_mori_ep
:
if
shared_output
is
not
None
:
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
or
self
.
dpsk_fp16_quick
:
if
hidden_states
.
dtype
!=
torch
.
float16
or
self
.
dpsk_fp16_quick
:
final_hidden_states
=
final_hidden_states
+
shared_output
final_hidden_states
=
final_hidden_states
+
shared_output
...
@@ -255,7 +250,11 @@ class DeepseekV2MoE(nn.Module):
...
@@ -255,7 +250,11 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details.
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
*
(
1.
/
self
.
routed_scaling_factor
)
else
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
not
self
.
use_mori_ep
:
if
self
.
tp_size
>
1
:
if
self
.
tp_size
>
1
:
if
envs
.
VLLM_ENABLE_TBO
:
if
envs
.
VLLM_ENABLE_TBO
:
final_hidden_states
=
self
.
tbo_all_reduce
(
final_hidden_states
)
final_hidden_states
=
self
.
tbo_all_reduce
(
final_hidden_states
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment