Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
916b5876
Commit 916b5876 authored Dec 10, 2025 by 王敏
Browse files
[fix]修复deepep 高吞吐模式vmfault问题
parent
1a315a58
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing 3 changed files with 9 additions and 7 deletions
+9
-7
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
...ation/compressed_tensors/compressed_tensors_moe_marlin.py
+2
-1
vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
...del_executor/layers/quantization/slimquant_w4a8_marlin.py
+3
-1
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+4
-5
No files found.
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
View file @
916b5876
...
@@ -285,9 +285,10 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
...
@@ -285,9 +285,10 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
shared_output
:
Optional
[
torch
.
Tensor
]
=
None
,
shared_output
:
Optional
[
torch
.
Tensor
]
=
None
,
q_x
:
Optional
[
torch
.
Tensor
]
=
None
,
**
_
):
**
_
):
return
fused_experts_impl_int8_marlin
(
return
fused_experts_impl_int8_marlin
(
hidden_states
=
x
,
hidden_states
=
x
if
q_x
is
None
else
q_x
,
w1
=
w1
,
w1
=
w1
,
w2
=
w2
,
w2
=
w2
,
topk_weights
=
topk_weights
,
topk_weights
=
topk_weights
,
...
...
vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
View file @
916b5876
...
@@ -263,7 +263,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
...
@@ -263,7 +263,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
**
_
):
**
_
):
workspace
,
global_reduce_buffer
=
MarlinMoeWorkspace
(
x
.
device
).
get_buffers
()
workspace
,
global_reduce_buffer
=
MarlinMoeWorkspace
(
x
.
device
).
get_buffers
()
return
fused_experts_impl_w4a8_marlin
(
return
fused_experts_impl_w4a8_marlin
(
x
,
x
if
q_x
is
None
else
q_x
,
w1
,
w1
,
w2
,
w2
,
topk_ids
=
topk_ids
,
topk_ids
=
topk_ids
,
...
@@ -510,6 +510,8 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
...
@@ -510,6 +510,8 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
False
)
False
)
return
TritonOrGroupGemmExperts
(
return
TritonOrGroupGemmExperts
(
# use_int4_w4a8=True,
# per_act_token_quant=True,
fused_experts
=
self
.
w4a8_fused_moe_marlin_forward
fused_experts
=
self
.
w4a8_fused_moe_marlin_forward
)
)
vllm/model_executor/models/deepseek_v2.py
View file @
916b5876
...
@@ -717,9 +717,8 @@ class DeepseekV2DecoderLayer(nn.Module):
...
@@ -717,9 +717,8 @@ class DeepseekV2DecoderLayer(nn.Module):
self
.
dp_size
=
get_dp_group
().
world_size
self
.
dp_size
=
get_dp_group
().
world_size
vllm_config
=
get_current_vllm_config
()
vllm_config
=
get_current_vllm_config
()
parallel_config
=
vllm_config
.
parallel_config
parallel_config
=
vllm_config
.
parallel_config
self
.
use_deepep
=
self
.
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
and
\
self
.
use_deepep_ll
=
self
.
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
and
\
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
or
\
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_low_latency"
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_low_latency"
)
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
if
(
config
.
n_routed_experts
is
not
None
if
(
config
.
n_routed_experts
is
not
None
...
@@ -848,7 +847,7 @@ class DeepseekV2DecoderLayer(nn.Module):
...
@@ -848,7 +847,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states
,
residual
)
hidden_states
,
residual
)
if
isinstance
(
self
.
mlp
,
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
self
.
tp_size
>
1
:
DeepseekV2MoE
)
and
self
.
use_deepep
_ll
and
self
.
tp_size
>
1
:
self
.
tp_rank
=
get_tensor_model_parallel_rank
()
self
.
tp_rank
=
get_tensor_model_parallel_rank
()
ori_bs
=
hidden_states
.
shape
[
0
]
ori_bs
=
hidden_states
.
shape
[
0
]
...
@@ -861,7 +860,7 @@ class DeepseekV2DecoderLayer(nn.Module):
...
@@ -861,7 +860,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states
=
self
.
mlp
(
hidden_states
)
hidden_states
=
self
.
mlp
(
hidden_states
)
if
isinstance
(
self
.
mlp
,
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
self
.
tp_size
>
1
:
DeepseekV2MoE
)
and
self
.
use_deepep
_ll
and
self
.
tp_size
>
1
:
hidden_states
=
tensor_model_parallel_all_gather
(
hidden_states
,
dim
=
0
).
contiguous
()
hidden_states
=
tensor_model_parallel_all_gather
(
hidden_states
,
dim
=
0
).
contiguous
()
hidden_states
=
hidden_states
[:
ori_bs
,
:].
contiguous
()
hidden_states
=
hidden_states
[:
ori_bs
,
:].
contiguous
()
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment