Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
76695c0a
Commit
76695c0a
authored
Jan 17, 2026
by
王敏
Browse files
优化deepep相关代码
parent
cda54326
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
45 additions
and
33 deletions
+45
-33
vllm/config.py
vllm/config.py
+0
-4
vllm/distributed/device_communicators/base_device_communicator.py
...tributed/device_communicators/base_device_communicator.py
+1
-1
vllm/envs.py
vllm/envs.py
+6
-0
vllm/forward_context.py
vllm/forward_context.py
+2
-2
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+2
-2
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+26
-16
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+5
-6
vllm/zero_overhead/v1/eagle.py
vllm/zero_overhead/v1/eagle.py
+3
-2
No files found.
vllm/config.py
View file @
76695c0a
...
...
@@ -4822,13 +4822,9 @@ class VllmConfig:
if
ep_sp
or
enable_dp_attention
:
batch_size_capture_list
=
sorted
(
set
([
round_up
(
i
,
tp_size
)
for
i
in
batch_size_capture_list
]))
if
1
not
in
batch_size_capture_list
:
batch_size_capture_list
.
insert
(
0
,
1
)
else
:
if
ep_sp
or
enable_dp_attention
:
batch_size_capture_list
=
sorted
(
set
([
round_up
(
i
,
tp_size
)
for
i
in
batch_size_capture_list
]))
if
1
not
in
batch_size_capture_list
:
batch_size_capture_list
.
insert
(
0
,
1
)
self
.
compilation_config
.
init_with_cudagraph_sizes
(
batch_size_capture_list
)
...
...
vllm/distributed/device_communicators/base_device_communicator.py
View file @
76695c0a
...
...
@@ -103,7 +103,7 @@ class DeviceCommunicatorBase:
# as long as we use data parallel (coupled data parallel
# where all data parallel ranks execute forward together),
# we initialize the all2all manager used in expert parallel.
use_ep
=
config
.
parallel_config
.
data_parallel_size
>
1
use_ep
=
config
.
parallel_config
.
data_parallel_size
>
1
and
not
config
.
parallel_config
.
enable_dp_attention
self
.
use_all2all
=
"ep"
in
unique_name
and
use_ep
self
.
all2all_manager
:
Optional
[
All2AllManagerBase
]
=
None
...
...
vllm/envs.py
View file @
76695c0a
...
...
@@ -203,6 +203,7 @@ if TYPE_CHECKING:
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER
:
bool
=
False
VLLM_USE_FUSED_FILL_RMS_CAT
:
bool
=
False
VLLM_ENABLE_DEEPEP_HT_DEEPGEMM
:
bool
=
True
VLLM_ENABLE_DEEPEP_INT8_DISPATCH
:
bool
=
True
VLLM_ZERO_OVERHEAD_ENHANCE
:
bool
=
False
VLLM_USE_FUSED_QA_KVA_GEMM
:
bool
=
False
VLLM_V1_FAST_TOKEN_ID_COPY
:
bool
=
False
...
...
@@ -1318,6 +1319,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda
:
(
os
.
getenv
(
'VLLM_ENABLE_DEEPEP_HT_DEEPGEMM'
,
'1'
).
lower
()
in
(
"true"
,
"1"
)),
# If set to "1" or "true" (the default), vLLM uses DeepEP int8 dispatch.
"VLLM_ENABLE_DEEPEP_INT8_DISPATCH"
:
lambda
:
(
os
.
getenv
(
'VLLM_ENABLE_DEEPEP_INT8_DISPATCH'
,
'1'
).
lower
()
in
(
"true"
,
"1"
)),
# Only quantized DeepSeek models supported.
# Unquantized versions are not supported.
"VLLM_USE_FUSED_QA_KVA_GEMM"
:
...
...
vllm/forward_context.py
View file @
76695c0a
...
...
@@ -136,8 +136,8 @@ def set_forward_context(
forward_start_time
=
time
.
perf_counter
()
dp_metadata
:
Optional
[
DPMetadata
]
=
None
dp_size
=
vllm_config
.
parallel_config
.
data_parallel_size
use_navie_
ep
=
envs
.
VLLM_ALL2ALL_BACKEND
==
'naive'
and
dp_size
>
1
and
vllm_config
.
parallel_config
.
enable_expert_parallel
if
use_navie_
ep
and
dp_size
>
1
and
(
use_navie_
all2all
=
envs
.
VLLM_ALL2ALL_BACKEND
==
'naive'
and
dp_size
>
1
if
use_navie_
all2all
and
dp_size
>
1
and
(
attn_metadata
is
not
None
or
num_tokens
is
not
None
):
dp_metadata
=
DPMetadata
.
make
(
vllm_config
.
parallel_config
,
attn_metadata
,
num_tokens
or
0
,
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
76695c0a
...
...
@@ -192,7 +192,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
and
moe
.
quant_config
.
block_shape
==
DEEPEP_QUANT_BLOCK_SHAPE
)
use_int8_dispatch
=
False
use_int8_dispatch
=
moe
.
quant_config
.
quant_dtype
==
torch
.
int8
and
envs
.
VLLM_ENABLE_DEEPEP_HT_DEEPGEMM
ll_prepare_finalize
=
DeepEPLLPrepareAndFinalize
(
ll_handle
,
...
...
@@ -249,7 +249,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
and
moe
.
quant_config
.
block_shape
==
DEEPEP_QUANT_BLOCK_SHAPE
)
use_int8_dispatch
=
moe
.
quant_config
.
quant_dtype
==
torch
.
int8
use_int8_dispatch
=
moe
.
quant_config
.
quant_dtype
==
torch
.
int8
and
envs
.
VLLM_ENABLE_DEEPEP_HT_DEEPGEMM
# Note (varun): Whether to use FP8 dispatch or not needs some
# profiling. Turning it off for now.
...
...
vllm/v1/spec_decode/eagle.py
View file @
76695c0a
...
...
@@ -96,6 +96,10 @@ class EagleProposer:
self
.
enable_dp_attention
=
vllm_config
.
parallel_config
.
enable_dp_attention
self
.
attn_tp_size
=
vllm_config
.
parallel_config
.
tensor_parallel_size
self
.
ep_sp
=
False
if
self
.
enable_expert_parallel
and
self
.
dp_size
>
1
and
self
.
attn_tp_size
>
1
:
self
.
ep_sp
=
True
def
propose
(
self
,
# [num_tokens]
...
...
@@ -193,8 +197,10 @@ class EagleProposer:
if
self
.
enable_dp_attention
:
num_input_tokens
=
round_up
(
num_input_tokens
,
self
.
attn_tp_size
)
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_input_tokens
)
num_input_tokens
+=
num_pad
# num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
# num_input_tokens += num_pad
# copy inputs to buffer for cudagraph
self
.
positions
[:
num_tokens
]
=
target_positions
self
.
hidden_states
[:
num_tokens
]
=
target_hidden_states
...
...
@@ -543,8 +549,7 @@ class EagleProposer:
# TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations.
if
not
self
.
enable_dp_attention
and
not
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_auto"
:
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
or
envs
.
VLLM_ALL2ALL_BACKEND
!=
'naive'
:
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
:
# Early exit.
return
0
,
None
...
...
@@ -569,6 +574,7 @@ class EagleProposer:
self
,
num_tokens
:
int
,
attn_metadata
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
num_tokens_across_dp
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
None
:
if
attn_metadata
is
not
None
and
self
.
attn_metadata_cudagraph
is
None
:
self
.
attn_metadata_cudagraph
=
attn_metadata
[
...
...
@@ -576,12 +582,13 @@ class EagleProposer:
# Padding for DP
num_input_tokens
=
num_tokens
num_pad
,
_
=
self
.
get_dp_padding
(
num_tokens
)
num_input_tokens
+=
num_pad
#
num_pad,
num_tokens_across_dp
= self.get_dp_padding(num_tokens)
#
num_input_tokens += num_pad
with
set_forward_context
(
attn_metadata
,
self
.
vllm_config
,
num_tokens
=
num_tokens
):
num_tokens
=
num_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
):
self
.
model
(
self
.
input_ids
[:
num_input_tokens
],
self
.
positions
[:
num_input_tokens
],
...
...
@@ -590,10 +597,12 @@ class EagleProposer:
if
self
.
dp_size
>
1
and
(
self
.
enable_expert_parallel
or
self
.
enable_dp_attention
)
and
self
.
num_speculative_tokens
>
1
:
num_tokens
=
1
if
self
.
enable_dp_attention
or
self
.
ep_sp
:
num_tokens
=
round_up
(
num_tokens
,
self
.
attn_tp_size
)
# DP attention needs all DP ranks to process the same number of tokens
if
self
.
enable_dp_attention
:
num_tokens
=
round_up
(
num_tokens
,
self
.
attn_tp_size
)
num_pad
,
_
=
self
.
get_dp_padding
(
num_tokens
)
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_tokens
)
num_tokens
+=
num_pad
if
not
get_warming_up
():
...
...
@@ -621,19 +630,20 @@ class EagleProposer:
attn_metadata_cudagraph
.
num_actual_tokens
=
num_tokens
attn_metadata_cudagraph
.
num_decodes
=
num_tokens
attn_metadata_cudagraph
.
num_decode_tokens
=
num_tokens
self
.
attn_metadata_cudagraph
.
slot_mapping
[:
num_tokens
]
=
(
attn_metadata_cudagraph
.
slot_mapping
[:
num_tokens
]
=
(
attn_metadata
.
slot_mapping
)
attn_metadata_cudagraph
.
decode
.
seq_lens
[:
num_tokens
]
=
(
attn_metadata
.
decode
.
seq_lens
)
self
.
attn_metadata_cudagraph
.
query_start_loc
[:
num_tokens
+
1
]
=
(
attn_metadata_cudagraph
.
query_start_loc
[:
num_tokens
+
1
]
=
(
attn_metadata
.
query_start_loc
)
self
.
attn_metadata_cudagraph
.
decode
.
block_table
[:
num_tokens
]
=
(
attn_metadata_cudagraph
.
decode
.
block_table
[:
num_tokens
]
=
(
attn_metadata
.
decode
.
block_table
)
with
set_forward_context
(
attn_metadata
,
self
.
vllm_config
,
num_tokens
=
num_tokens
):
num_tokens
=
num_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
):
self
.
model
(
self
.
input_ids
[:
num_tokens
],
self
.
positions
[:
num_tokens
],
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
76695c0a
...
...
@@ -1276,9 +1276,7 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
#
# TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations.
if
not
self
.
enable_dp_attention
and
not
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_auto"
:
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
or
envs
.
VLLM_ALL2ALL_BACKEND
!=
'naive'
:
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
:
# Early exit.
return
0
,
None
...
...
@@ -2246,7 +2244,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
if
self
.
speculative_config
and
self
.
speculative_config
.
use_eagle
()
and
not
is_profile
:
#assert isinstance(self.drafter, EagleProposer)
if
hasattr
(
self
,
'drafter'
)
and
isinstance
(
self
.
drafter
,
EagleProposer
):
self
.
drafter
.
dummy_run
(
num_tokens
,
attn_metadata
)
self
.
drafter
.
dummy_run
(
num_tokens
,
attn_metadata
,
num_tokens_across_dp
=
num_tokens_across_dp
)
# This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real
...
...
vllm/zero_overhead/v1/eagle.py
View file @
76695c0a
...
...
@@ -114,8 +114,9 @@ class V1ZeroEagleProposer(EagleProposer):
if
self
.
enable_dp_attention
:
num_input_tokens
=
round_up
(
num_input_tokens
,
self
.
attn_tp_size
)
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_input_tokens
)
num_input_tokens
+=
num_pad
# num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
# num_input_tokens += num_pad
# copy inputs to buffer for cudagraph
self
.
positions
[:
num_tokens
]
=
target_positions
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment