Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
13130b89
Commit
13130b89
authored
Dec 18, 2025
by
王敏
Browse files
[feat]合入基于deepep的大EP
parent
06106338
Changes
27
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
303 additions
and
89 deletions
+303
-89
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+14
-0
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+226
-83
vllm/model_executor/parameter.py
vllm/model_executor/parameter.py
+9
-0
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+8
-0
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+32
-6
vllm/zero_overhead/v1/eagle.py
vllm/zero_overhead/v1/eagle.py
+8
-0
vllm/zero_overhead/v1/gpu_model_runner.py
vllm/zero_overhead/v1/gpu_model_runner.py
+6
-0
No files found.
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
13130b89
...
...
@@ -39,6 +39,20 @@ def get_w8a8_int8_marlin_weights(
return
weight
def w8a8_nt_kpack2_marlin_weight(
    w8a8_w,  # original annotation: [size_n, size_k // 2] -- TODO confirm packing with caller
    k_tile=16,
    n_tile=16,
):
    """Re-tile a 2-D int8 weight into contiguous ``n_tile x k_tile`` blocks.

    The input of shape ``(size_n, size_k)`` is cut into tiles of ``n_tile``
    rows by ``k_tile`` columns. Each tile is laid out contiguously
    (row-major inside the tile), and the tiles belonging to one group of
    ``n_tile`` rows are concatenated into a single output row.

    Args:
        w8a8_w: 2-D ``torch.int8`` tensor. Dimension 0 is tiled by
            ``n_tile`` and dimension 1 by ``k_tile``.
        k_tile: Tile extent along dimension 1.
        n_tile: Tile extent along dimension 0.

    Returns:
        A contiguous int8 tensor of shape
        ``(size_n // n_tile, size_k * n_tile)``.

    Raises:
        AssertionError: If the dtype is not ``torch.int8`` or a dimension
            is not divisible by its tile size.
    """
    assert w8a8_w.dtype == torch.int8, "w8a8_w 必须是 int8 类型"
    size_n, size_k = w8a8_w.shape
    # Each dimension must be divisible by the tile size that actually splits
    # it below (rows by n_tile, columns by k_tile).
    assert size_n % n_tile == 0 and size_k % k_tile == 0, \
        "k_tile / n_tile 必须能整除对应维度"

    row_tiles = size_n // n_tile
    col_tiles = size_k // k_tile

    # (row_tiles, n_tile, col_tiles, k_tile) -> (row_tiles, col_tiles, n_tile, k_tile)
    tiled = w8a8_w.reshape((row_tiles, n_tile, col_tiles, k_tile))
    tiled = tiled.permute((0, 2, 1, 3)).contiguous()

    # One output row per group of n_tile input rows; it holds
    # col_tiles * n_tile * k_tile == size_k * n_tile elements.
    return tiled.reshape((row_tiles, size_k * n_tile))
def
sparse_cutlass_supported
()
->
bool
:
if
not
current_platform
.
is_cuda
():
return
False
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
13130b89
This diff is collapsed.
Click to expand it.
vllm/model_executor/parameter.py
View file @
13130b89
...
...
@@ -96,6 +96,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
def __init__(self, output_dim: int, **kwargs):
    """Store the output dimension and delegate the rest to the base class.

    ``expect_tp_size`` starts at -1. The column-parallel loaders of this
    class use tensor-parallel rank 0 instead of the real rank when it has
    been set to 1.
    """
    self._output_dim = output_dim
    super().__init__(**kwargs)
    # -1 is the initial value; the loaders only special-case the value 1.
    self.expect_tp_size = -1
@
property
def
output_dim
(
self
):
...
...
@@ -103,6 +105,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
def
load_column_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
):
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
shard_size
=
self
.
data
.
shape
[
self
.
output_dim
]
loaded_weight
=
loaded_weight
.
narrow
(
self
.
output_dim
,
tp_rank
*
shard_size
,
shard_size
)
...
...
@@ -123,6 +127,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
param_data
=
self
.
data
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
param_data
=
param_data
.
narrow
(
self
.
output_dim
,
shard_offset
,
shard_size
)
loaded_weight
=
loaded_weight
.
narrow
(
self
.
output_dim
,
...
...
@@ -167,6 +173,7 @@ class RowvLLMParameter(BasevLLMParameter):
def __init__(self, input_dim: int, **kwargs):
    """Store the input dimension and delegate the rest to the base class.

    ``expect_tp_size`` starts at -1. ``load_row_parallel_weight`` uses
    tensor-parallel rank 0 instead of the real rank when it has been set
    to 1.
    """
    self._input_dim = input_dim
    super().__init__(**kwargs)
    # -1 is the initial value; the loader only special-case the value 1.
    self.expect_tp_size = -1
@
property
def
input_dim
(
self
):
...
...
@@ -174,6 +181,8 @@ class RowvLLMParameter(BasevLLMParameter):
def
load_row_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
):
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
shard_size
=
self
.
data
.
shape
[
self
.
input_dim
]
loaded_weight
=
loaded_weight
.
narrow
(
self
.
input_dim
,
tp_rank
*
shard_size
,
shard_size
)
...
...
vllm/v1/spec_decode/eagle.py
View file @
13130b89
...
...
@@ -25,6 +25,7 @@ from vllm.v1.attention.backends.mla.common import MLACommonMetadata, MLACommonDe
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.spec_decode.utils
import
prepare_eagle_input_kernel
from
vllm.utils
import
round_up
logger
=
init_logger
(
__name__
)
...
...
@@ -186,6 +187,13 @@ class EagleProposer:
num_input_tokens
=
self
.
vllm_config
.
pad_for_cudagraph
(
num_tokens
)
else
:
num_input_tokens
=
num_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
vllm_config
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# copy inputs to buffer for cudagraph
self
.
positions
[:
num_tokens
]
=
target_positions
self
.
hidden_states
[:
num_tokens
]
=
target_hidden_states
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
13130b89
...
...
@@ -28,7 +28,8 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
from
vllm.distributed.kv_transfer.kv_connector.v1
import
KVConnectorBase_V1
from
vllm.distributed.parallel_state
import
(
get_pp_group
,
get_tp_group
,
graph_capture
,
is_global_first_rank
,
prepare_communication_buffer_for_model
)
prepare_communication_buffer_for_model
,
get_tensor_model_parallel_world_size
)
from
vllm.forward_context
import
(
DPMetadata
,
get_forward_context
,
set_forward_context
,
set_profilling
)
from
vllm.logger
import
init_logger
...
...
@@ -1267,7 +1268,7 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations.
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
:
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
or
envs
.
VLLM_ALL2ALL_BACKEND
==
'naive'
:
# Early exit.
return
0
,
None
...
...
@@ -1361,6 +1362,12 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
else
:
num_input_tokens
=
num_scheduled_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# Padding for DP
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_input_tokens
)
num_input_tokens
+=
num_pad
...
...
@@ -1789,6 +1796,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
decoding
=
spec_decode_metadata
is
not
None
)
spec_token_ids
=
draft_token_ids
.
tolist
()
if
not
envs
.
VLLM_REJECT_SAMPLE_OPT
:
draft_token_ids
=
draft_result
else
:
...
...
@@ -1799,7 +1808,6 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
draft_probs
,
draft_req_ids
)
else
:
self
.
draft_probs
.
update
(
draft_probs
,
draft_req_ids
)
spec_token_ids
=
draft_token_ids
.
tolist
()
return
spec_token_ids
...
...
@@ -1920,6 +1928,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
time_after_load
-
time_before_load
)
prepare_communication_buffer_for_model
(
self
.
model
)
if
hasattr
(
self
,
"drafter"
):
prepare_communication_buffer_for_model
(
self
.
drafter
.
model
)
if
is_mixture_of_experts
(
self
.
model
)
and
self
.
parallel_config
.
enable_eplb
:
logger
.
info
(
"EPLB is enabled for model %s."
,
...
...
@@ -2092,6 +2103,12 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
is_profile
:
bool
=
False
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_tokens
=
round_up
(
num_tokens
,
tp_size
)
# Padding for DP
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_tokens
)
num_tokens
+=
num_pad
...
...
@@ -2156,6 +2173,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
input_ids
=
None
inputs_embeds
=
self
.
inputs_embeds
[:
num_tokens
]
else
:
self
.
input_ids
[:
num_tokens
]
=
torch
.
randint
(
0
,
self
.
model_config
.
get_vocab_size
(),
(
num_tokens
,),
dtype
=
torch
.
int32
)
input_ids
=
self
.
input_ids
[:
num_tokens
]
inputs_embeds
=
None
if
self
.
uses_mrope
:
...
...
@@ -3183,6 +3202,12 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
else
:
num_input_tokens
=
num_scheduled_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# Padding for DP
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_input_tokens
)
num_input_tokens
+=
num_pad
...
...
@@ -3608,16 +3633,17 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
if
not
envs
.
VLLM_REJECT_SAMPLE_OPT
:
draft_token_ids
=
draft_result
else
:
draft_req_ids
=
list
(
scheduler_output
.
num_scheduled_tokens
.
keys
())
draft_token_ids
,
draft_probs
=
draft_result
spec_token_ids
=
draft_token_ids
.
tolist
()
if
envs
.
VLLM_REJECT_SAMPLE_OPT
:
draft_req_ids
=
list
(
scheduler_output
.
num_scheduled_tokens
.
keys
())
if
self
.
draft_probs
is
None
:
self
.
draft_probs
=
DraftProbs
(
draft_probs
,
draft_req_ids
)
else
:
self
.
draft_probs
.
update
(
draft_probs
,
draft_req_ids
)
spec_token_ids
=
draft_token_ids
.
tolist
()
return
spec_token_ids
# TODO: once stable, replace GPUModelRunner with GPUModelRunnerMTP
if
envs
.
VLLM_USE_ZERO_MTP
:
...
...
vllm/zero_overhead/v1/eagle.py
View file @
13130b89
...
...
@@ -10,6 +10,7 @@ from vllm.v1.attention.backends.mla.common import MLACommonMetadata
from
vllm.v1.attention.backends.utils
import
CommonAttentionMetadata
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.spec_decode.eagle
import
PADDING_SLOT_ID
,
EagleProposer
from
vllm.utils
import
round_up
class
V1ZeroEagleProposer
(
EagleProposer
):
...
...
@@ -110,6 +111,13 @@ class V1ZeroEagleProposer(EagleProposer):
num_input_tokens
=
self
.
vllm_config
.
pad_for_cudagraph
(
num_tokens
)
else
:
num_input_tokens
=
num_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
vllm_config
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# copy inputs to buffer for cudagraph
self
.
positions
[:
num_tokens
]
=
target_positions
self
.
hidden_states
[:
num_tokens
]
=
target_hidden_states
...
...
vllm/zero_overhead/v1/gpu_model_runner.py
View file @
13130b89
...
...
@@ -441,6 +441,12 @@ class V1ZeroModelRunner(GPUModelRunner):
else
:
num_input_tokens
=
num_scheduled_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# Padding for DP
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_input_tokens
)
num_input_tokens
+=
num_pad
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment