Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
16f6dfc0
Commit
16f6dfc0
authored
Feb 11, 2026
by
王敏
Browse files
解决开启ep mtp>1时cudagraph卡住问题
parent
4612aad6
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
138 additions
and
34 deletions
+138
-34
vllm/forward_context.py
vllm/forward_context.py
+14
-3
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+25
-13
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+92
-14
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+3
-3
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+3
-0
vllm/zero_overhead/v1/gpu_model_runner.py
vllm/zero_overhead/v1/gpu_model_runner.py
+1
-1
No files found.
vllm/forward_context.py
View file @
16f6dfc0
...
@@ -136,8 +136,8 @@ def set_forward_context(
...
@@ -136,8 +136,8 @@ def set_forward_context(
forward_start_time
=
time
.
perf_counter
()
forward_start_time
=
time
.
perf_counter
()
dp_metadata
:
Optional
[
DPMetadata
]
=
None
dp_metadata
:
Optional
[
DPMetadata
]
=
None
dp_size
=
vllm_config
.
parallel_config
.
data_parallel_size
dp_size
=
vllm_config
.
parallel_config
.
data_parallel_size
use_navie_
ep
=
envs
.
VLLM_ALL2ALL_BACKEND
==
'naive'
and
dp_size
>
1
and
vllm_config
.
parallel_config
.
enable_expert_parallel
use_navie_
all2all
=
envs
.
VLLM_ALL2ALL_BACKEND
==
'naive'
and
dp_size
>
1
if
use_navie_
ep
and
dp_size
>
1
and
(
if
use_navie_
all2all
and
dp_size
>
1
and
(
attn_metadata
is
not
None
or
num_tokens
is
not
None
):
attn_metadata
is
not
None
or
num_tokens
is
not
None
):
dp_metadata
=
DPMetadata
.
make
(
vllm_config
.
parallel_config
,
dp_metadata
=
DPMetadata
.
make
(
vllm_config
.
parallel_config
,
attn_metadata
,
num_tokens
or
0
,
attn_metadata
,
num_tokens
or
0
,
...
@@ -211,3 +211,14 @@ def set_profilling(profiling):
...
@@ -211,3 +211,14 @@ def set_profilling(profiling):
def
get_profilling
()
->
bool
:
def
get_profilling
()
->
bool
:
global
_profiling
global
_profiling
return
_profiling
return
_profiling
# Process-wide flag: True while the worker is running its warm-up pass.
_warming_up = False


def set_warming_up(warming_up: bool):
    """Set the module-level warming-up flag.

    The flag is updated immediately, so a plain ``set_warming_up(True)``
    statement keeps working exactly as before.  The returned object is a
    context manager, so the call may also be used as
    ``with set_warming_up(True): ...``, in which case the previous value of
    the flag is restored when the ``with`` block exits (even on error).

    The earlier version applied ``@contextmanager`` directly to a function
    that never yields; that only worked for plain calls by accident and
    raised as soon as the result was entered in a ``with`` statement.

    Args:
        warming_up: New value of the flag.

    Returns:
        A context manager that restores the previous flag value on exit.
        It can be safely ignored by callers that toggle the flag manually.
    """
    global _warming_up
    previous = _warming_up
    # Apply eagerly so callers that ignore the return value still see the
    # new state.
    _warming_up = warming_up

    @contextmanager
    def _restore_on_exit():
        global _warming_up
        try:
            yield
        finally:
            _warming_up = previous

    return _restore_on_exit()


def get_warming_up() -> bool:
    """Return the current value of the module-level warming-up flag."""
    # Reading a module global does not need a ``global`` declaration.
    return _warming_up
\ No newline at end of file
vllm/model_executor/layers/fused_moe/layer.py
View file @
16f6dfc0
...
@@ -630,22 +630,34 @@ def determine_expert_map(
...
@@ -630,22 +630,34 @@ def determine_expert_map(
if
ep_size
==
1
:
if
ep_size
==
1
:
return
(
global_num_experts
,
None
)
return
(
global_num_experts
,
None
)
local_num_experts
=
global_num_experts
//
ep_size
# local_num_experts = global_num_experts // ep_size
# # Create a tensor of size num_experts filled with -1
# expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
# # Create a expert map for the local experts
# if ep_rank < (ep_size - 1):
# # Each non-last rank gets local_num_experts experts.
# expert_map[ep_rank * local_num_experts:
# (ep_rank + 1) * local_num_experts] = \
# torch.arange(0, local_num_experts, dtype=torch.int32)
# else:
# # All remaining experts are assigned to the last rank.
# local_num_experts = (global_num_experts - ep_rank * local_num_experts)
# expert_map[-local_num_experts:] = \
# torch.arange(0, local_num_experts, dtype=torch.int32)
# Distribute experts as evenly as possible to each rank.
base_experts
=
global_num_experts
//
ep_size
remainder
=
global_num_experts
%
ep_size
local_num_experts
=
base_experts
+
1
if
ep_rank
<
remainder
else
base_experts
# Create a tensor of size num_experts filled with -1
# Create a tensor of size num_experts filled with -1
expert_map
=
torch
.
full
((
global_num_experts
,
),
-
1
,
dtype
=
torch
.
int32
)
expert_map
=
torch
.
full
((
global_num_experts
,),
-
1
,
dtype
=
torch
.
int32
)
# Create a expert map for the local experts
start_idx
=
ep_rank
*
base_experts
+
min
(
ep_rank
,
remainder
)
if
ep_rank
<
(
ep_size
-
1
):
expert_map
[
start_idx
:
start_idx
+
local_num_experts
]
=
torch
.
arange
(
# Each non-last rank gets local_num_experts experts.
0
,
local_num_experts
,
dtype
=
torch
.
int32
expert_map
[
ep_rank
*
local_num_experts
:
)
(
ep_rank
+
1
)
*
local_num_experts
]
=
\
torch
.
arange
(
0
,
local_num_experts
,
dtype
=
torch
.
int32
)
else
:
# All remaining experts are assigned to the last rank.
local_num_experts
=
(
global_num_experts
-
ep_rank
*
local_num_experts
)
expert_map
[
-
local_num_experts
:]
=
\
torch
.
arange
(
0
,
local_num_experts
,
dtype
=
torch
.
int32
)
return
(
local_num_experts
,
expert_map
)
return
(
local_num_experts
,
expert_map
)
...
...
vllm/v1/spec_decode/eagle.py
View file @
16f6dfc0
...
@@ -12,8 +12,9 @@ from vllm.attention.layer import Attention
...
@@ -12,8 +12,9 @@ from vllm.attention.layer import Attention
from
vllm.config
import
(
CompilationLevel
,
VllmConfig
,
from
vllm.config
import
(
CompilationLevel
,
VllmConfig
,
get_layers_from_vllm_config
)
get_layers_from_vllm_config
)
from
vllm.distributed.parallel_state
import
get_pp_group
from
vllm.distributed.parallel_state
import
get_pp_group
from
vllm.forward_context
import
set_forward_context
from
vllm.forward_context
import
DPMetadata
,
set_forward_context
,
get_warming_up
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
import
vllm.envs
as
envs
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.models
import
supports_multimodal
from
vllm.model_executor.models
import
supports_multimodal
from
vllm.model_executor.models.llama_eagle3
import
Eagle3LlamaForCausalLM
from
vllm.model_executor.models.llama_eagle3
import
Eagle3LlamaForCausalLM
...
@@ -516,29 +517,106 @@ class EagleProposer:
...
@@ -516,29 +517,106 @@ class EagleProposer:
logger
.
info
(
"Loading EAGLE LM head weights from the target model."
)
logger
.
info
(
"Loading EAGLE LM head weights from the target model."
)
self
.
model
.
lm_head
=
target_language_model
.
lm_head
self
.
model
.
lm_head
=
target_language_model
.
lm_head
def get_dp_padding(self, num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
    """Work out the padding this data-parallel rank needs for ``num_tokens``.

    Args:
        num_tokens: Token count on this rank before padding.

    Returns:
        A ``(num_pad, num_tokens_after_padding)`` pair.  ``num_pad`` is the
        maximum of the values returned by ``DPMetadata.num_tokens_across_dp``
        minus ``num_tokens``; the second item is a CPU ``int32`` tensor of
        length ``dp_size`` filled with that maximum.  ``(0, None)`` is
        returned when the data-parallel size is 1, when ``enforce_eager`` is
        set, or when the lookup raises ``RuntimeError``/``AttributeError``.
    """
    config = self.vllm_config
    world = config.parallel_config.data_parallel_size
    rank = config.parallel_config.data_parallel_rank

    # For DP: no padding when enforce_eager is set.  This allows
    # enforce_eager on the prefiller in a P/D setup while the decoder still
    # uses CUDA graphs (which this padding enables).
    #
    # TODO(tms) : There are many cases where padding is enabled for
    # prefills, causing unnecessary and excessive padding of activations.
    if world == 1 or config.model_config.enforce_eager:
        return 0, None

    try:
        # NOTE(review): presumably collects the per-rank token counts across
        # the DP group -- confirm against DPMetadata.
        tokens_per_rank = DPMetadata.num_tokens_across_dp(
            num_tokens, world, rank)
        largest = torch.max(tokens_per_rank).item()
        padded_counts = torch.tensor([largest] * world,
                                     device="cpu",
                                     dtype=torch.int32)
        pad = largest - num_tokens
    except (RuntimeError, AttributeError) as e:
        # The DP group may not be initialized yet during a dummy run;
        # fall back to no padding in that case.
        logger.debug(
            "Skipping DP padding in eagle get_dp_padding due to: %s", e)
        return 0, None
    else:
        return pad, padded_counts
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
dummy_run
(
def
dummy_run
(
self
,
self
,
num_tokens
:
int
,
num_tokens
:
int
,
attn_metadata
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
attn_metadata
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
num_tokens_across_dp
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
None
:
)
->
None
:
if
attn_metadata
is
not
None
and
self
.
attn_metadata_cudagraph
is
None
:
if
attn_metadata
is
not
None
and
self
.
attn_metadata_cudagraph
is
None
:
self
.
attn_metadata_cudagraph
=
attn_metadata
[
self
.
attn_metadata_cudagraph
=
attn_metadata
[
self
.
attn_layer_names
[
0
]]
self
.
attn_layer_names
[
0
]]
# Padding for DP
num_input_tokens
=
num_tokens
# num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
# num_input_tokens += num_pad
with
set_forward_context
(
attn_metadata
,
with
set_forward_context
(
attn_metadata
,
self
.
vllm_config
,
self
.
vllm_config
,
num_tokens
=
num_tokens
):
num_tokens
=
num_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
):
self
.
model
(
self
.
model
(
self
.
input_ids
[:
num_tokens
],
self
.
input_ids
[:
num_
input_
tokens
],
self
.
positions
[:
num_tokens
],
self
.
positions
[:
num_
input_
tokens
],
self
.
hidden_states
[:
num_tokens
],
self
.
hidden_states
[:
num_
input_
tokens
],
)
)
if
self
.
dp_size
>
1
and
self
.
enable_expert_parallel
and
self
.
num_speculative_tokens
>
1
:
if
self
.
dp_size
>
1
and
self
.
enable_expert_parallel
and
self
.
num_speculative_tokens
>
1
:
for
_
in
range
(
self
.
num_speculative_tokens
-
1
):
num_tokens
=
1
if
not
get_warming_up
():
common_attn_metadata
=
CommonAttentionMetadata
(
query_start_loc
=
self
.
runner
.
query_start_loc
[:
num_tokens
+
1
],
seq_lens
=
self
.
runner
.
seq_lens
[:
num_tokens
],
num_reqs
=
num_tokens
,
num_actual_tokens
=
num_tokens
,
max_query_len
=
num_tokens
,
slot_mapping
=
self
.
runner
.
slot_mapping
[:
num_tokens
],
spec_layer_decoding
=
True
)
assert
self
.
runner
is
not
None
# FIXME: need to consider multiple kv_cache_groups
attn_metadata
=
self
.
runner
.
attn_metadata_builders
[
0
].
build_for_cudagraph_capture
(
common_attn_metadata
=
common_attn_metadata
)
for
i
in
range
(
self
.
num_speculative_tokens
-
1
):
if
self
.
attn_metadata_cudagraph
is
not
None
:
if
i
==
0
:
attn_metadata_cudagraph
=
self
.
attn_metadata_cudagraph
attn_metadata_cudagraph
.
num_actual_tokens
=
num_tokens
attn_metadata_cudagraph
.
num_decodes
=
num_tokens
attn_metadata_cudagraph
.
num_decode_tokens
=
num_tokens
attn_metadata_cudagraph
.
slot_mapping
[:
num_tokens
]
=
(
attn_metadata
.
slot_mapping
)
attn_metadata_cudagraph
.
decode
.
seq_lens
[:
num_tokens
]
=
(
attn_metadata
.
decode
.
seq_lens
)
attn_metadata_cudagraph
.
query_start_loc
[:
num_tokens
+
1
]
=
(
attn_metadata
.
query_start_loc
)
attn_metadata_cudagraph
.
decode
.
block_table
[:
num_tokens
]
=
(
attn_metadata
.
decode
.
block_table
)
with
set_forward_context
(
attn_metadata
,
with
set_forward_context
(
attn_metadata
,
self
.
vllm_config
,
self
.
vllm_config
,
num_tokens
=
num_tokens
):
num_tokens
=
num_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
):
self
.
model
(
self
.
model
(
self
.
input_ids
[:
num_tokens
],
self
.
input_ids
[:
num_tokens
],
self
.
positions
[:
num_tokens
],
self
.
positions
[:
num_tokens
],
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
16f6dfc0
...
@@ -1274,8 +1274,7 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -1274,8 +1274,7 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
#
#
# TODO(tms) : There are many cases where padding is enabled for
# TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations.
# prefills, causing unnecessary and excessive padding of activations.
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
:
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
or
envs
.
VLLM_ALL2ALL_BACKEND
!=
'naive'
:
# Early exit.
# Early exit.
return
0
,
None
return
0
,
None
...
@@ -2240,7 +2239,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -2240,7 +2239,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
if
self
.
speculative_config
and
self
.
speculative_config
.
use_eagle
()
and
not
is_profile
:
if
self
.
speculative_config
and
self
.
speculative_config
.
use_eagle
()
and
not
is_profile
:
#assert isinstance(self.drafter, EagleProposer)
#assert isinstance(self.drafter, EagleProposer)
if
hasattr
(
self
,
'drafter'
)
and
isinstance
(
self
.
drafter
,
EagleProposer
):
if
hasattr
(
self
,
'drafter'
)
and
isinstance
(
self
.
drafter
,
EagleProposer
):
self
.
drafter
.
dummy_run
(
num_tokens
,
attn_metadata
)
self
.
drafter
.
dummy_run
(
num_tokens
,
attn_metadata
,
num_tokens_across_dp
=
num_tokens_across_dp
)
# This is necessary to avoid blocking DP.
# This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real
# For dummy runs, we typically skip EPLB since we don't have any real
...
...
vllm/v1/worker/gpu_worker.py
View file @
16f6dfc0
...
@@ -30,6 +30,7 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner
...
@@ -30,6 +30,7 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from
vllm.v1.worker.worker_base
import
WorkerBase
from
vllm.v1.worker.worker_base
import
WorkerBase
from
vllm.zero_overhead.utils
import
zero_overhead_stream
from
vllm.zero_overhead.utils
import
zero_overhead_stream
from
vllm.zero_overhead.v1.gpu_model_runner
import
V1ZeroModelRunner
from
vllm.zero_overhead.v1.gpu_model_runner
import
V1ZeroModelRunner
from
vllm.forward_context
import
(
set_warming_up
,
get_warming_up
)
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -260,6 +261,7 @@ class Worker(WorkerBase):
...
@@ -260,6 +261,7 @@ class Worker(WorkerBase):
# warm up sizes that are not in cudagraph capture sizes,
# warm up sizes that are not in cudagraph capture sizes,
# but users still want to compile for better performance,
# but users still want to compile for better performance,
# e.g. for the max-num-batched token size in chunked prefill.
# e.g. for the max-num-batched token size in chunked prefill.
set_warming_up
(
True
)
warmup_sizes
=
self
.
vllm_config
.
compilation_config
.
compile_sizes
.
copy
()
warmup_sizes
=
self
.
vllm_config
.
compilation_config
.
compile_sizes
.
copy
()
if
not
self
.
model_config
.
enforce_eager
:
if
not
self
.
model_config
.
enforce_eager
:
warmup_sizes
=
[
warmup_sizes
=
[
...
@@ -297,6 +299,7 @@ class Worker(WorkerBase):
...
@@ -297,6 +299,7 @@ class Worker(WorkerBase):
# Reset the seed to ensure that the random state is not affected by
# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
# the model initialization and profiling.
set_random_seed
(
self
.
model_config
.
seed
)
set_random_seed
(
self
.
model_config
.
seed
)
set_warming_up
(
False
)
def
get_model
(
self
)
->
nn
.
Module
:
def
get_model
(
self
)
->
nn
.
Module
:
return
self
.
model_runner
.
get_model
()
return
self
.
model_runner
.
get_model
()
...
...
vllm/zero_overhead/v1/gpu_model_runner.py
View file @
16f6dfc0
...
@@ -427,7 +427,7 @@ class V1ZeroModelRunner(GPUModelRunner):
...
@@ -427,7 +427,7 @@ class V1ZeroModelRunner(GPUModelRunner):
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
if
self
.
ep_sp
:
if
self
.
ep_sp
:
num_input_tokens
=
round_up
(
num_scheduled_tokens
,
tp_size
)
num_input_tokens
=
round_up
(
num_scheduled_tokens
,
self
.
tp_size
)
if
(
self
.
use_cuda_graph
if
(
self
.
use_cuda_graph
and
num_input_tokens
<=
self
.
cudagraph_batch_sizes
[
-
1
]):
and
num_input_tokens
<=
self
.
cudagraph_batch_sizes
[
-
1
]):
# Use piecewise CUDA graphs.
# Use piecewise CUDA graphs.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment