Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
13130b89
"csrc/quantization/w8a8/fp8/amd/quant_utils.cuh" did not exist on "3dcb3e8b9838cbbef83ce326b1a35b31a3cf14f2"
Commit
13130b89
authored
Dec 18, 2025
by
王敏
Browse files
[feat]合入基于deepep的大EP
parent
06106338
Changes
27
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
303 additions
and
89 deletions
+303
-89
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+14
-0
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+226
-83
vllm/model_executor/parameter.py
vllm/model_executor/parameter.py
+9
-0
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+8
-0
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+32
-6
vllm/zero_overhead/v1/eagle.py
vllm/zero_overhead/v1/eagle.py
+8
-0
vllm/zero_overhead/v1/gpu_model_runner.py
vllm/zero_overhead/v1/gpu_model_runner.py
+6
-0
No files found.
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
13130b89
...
...
@@ -39,6 +39,20 @@ def get_w8a8_int8_marlin_weights(
return
weight
def w8a8_nt_kpack2_marlin_weight(
    w8a8_w,  # [size_n, size_k // 2]
    k_tile=16,
    n_tile=16,
):
    """Rearrange a 2-D int8 weight into a tile-major layout.

    The input is viewed as a grid of ``n_tile x k_tile`` tiles (``n_tile``
    rows along dim 0, ``k_tile`` columns along dim 1). Each tile is
    flattened row-major, and all tiles belonging to the same tile-row are
    concatenated, so one output row holds one full row of tiles.

    Args:
        w8a8_w: 2-D ``torch.int8`` tensor. The original inline note gives its
            shape as ``[size_n, size_k // 2]`` (presumably two values are
            packed along K -- confirm against the caller).
        k_tile: Tile extent along dim 1 of the input.
        n_tile: Tile extent along dim 0 of the input.

    Returns:
        A contiguous int8 tensor of shape
        ``(rows // n_tile, cols * n_tile)`` where ``rows, cols`` is the
        input shape.

    Raises:
        AssertionError: If the dtype is not int8, or if dim 0 is not a
            multiple of ``n_tile`` / dim 1 is not a multiple of ``k_tile``.
    """
    # Message: "w8a8_w must be of int8 type".
    assert w8a8_w.dtype == torch.int8, "w8a8_w 必须是 int8 类型"
    size_n, size_k = w8a8_w.shape
    # Each dimension must be checked against the tile size it is actually
    # divided by in the reshape below (dim 0 by n_tile, dim 1 by k_tile).
    # Message: "k_tile / n_tile must evenly divide the matching dimension".
    assert size_n % n_tile == 0 and size_k % k_tile == 0, \
        "k_tile / n_tile 必须能整除对应维度"

    # (tile_row, row_in_tile, tile_col, col_in_tile)
    tiled = w8a8_w.reshape(
        (size_n // n_tile, n_tile, size_k // k_tile, k_tile))
    # Bring the two tile indices to the front so each tile is contiguous.
    tiled = tiled.permute((0, 2, 1, 3)).contiguous()
    # One output row per tile-row: (size_k // k_tile) tiles of
    # n_tile * k_tile elements each, i.e. size_k * n_tile elements.
    return tiled.reshape((size_n // n_tile, size_k * n_tile))
def
sparse_cutlass_supported
()
->
bool
:
if
not
current_platform
.
is_cuda
():
return
False
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
13130b89
...
...
@@ -40,9 +40,14 @@ from vllm.compilation.decorators import support_torch_compile
from
vllm.config
import
(
CacheConfig
,
ModelConfig
,
VllmConfig
,
get_current_vllm_config
)
from
vllm.distributed
import
(
get_ep_group
,
get_pp_group
,
get_dp_group
,
get_tensor_model_parallel_world_size
)
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_gather
,
get_tensor_model_parallel_rank
,
tensor_model_parallel_reduce_scatter
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
,
SharedFusedMoE
from
vllm.model_executor.layers.fused_moe.utils
import
EPSharedExperts
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
...
...
@@ -175,6 +180,14 @@ class DeepseekV2MoE(nn.Module):
self
.
physical_expert_end
=
(
self
.
physical_expert_start
+
self
.
n_local_physical_experts
)
dp_size
=
get_dp_group
().
world_size
self
.
use_mori_ep
=
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
envs
.
VLLM_ALL2ALL_BACKEND
==
'mori'
self
.
enable_expert_parallel
=
parallel_config
.
enable_expert_parallel
self
.
use_deepep
=
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
and
\
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
or
\
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_low_latency"
)
if
not
self
.
use_deepep
:
self
.
experts
=
FusedMoE
(
num_experts
=
config
.
n_routed_experts
,
top_k
=
config
.
num_experts_per_tok
,
...
...
@@ -205,6 +218,36 @@ class DeepseekV2MoE(nn.Module):
),
prefix
=
f
"
{
prefix
}
.shared_experts"
,
)
else
:
if
config
.
n_shared_experts
is
not
None
:
intermediate_size
=
(
config
.
moe_intermediate_size
*
config
.
n_shared_experts
)
self
.
shared_experts
=
EPSharedExperts
(
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
intermediate_size
,
hidden_act
=
config
.
hidden_act
,
quant_config
=
quant_config
,
reduce_results
=
False
,
prefix
=
f
"
{
prefix
}
.shared_experts"
,
)
self
.
experts
=
SharedFusedMoE
(
num_experts
=
config
.
n_routed_experts
,
top_k
=
config
.
num_experts_per_tok
,
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
config
.
moe_intermediate_size
,
reduce_results
=
False
,
renormalize
=
config
.
norm_topk_prob
,
quant_config
=
quant_config
,
use_grouped_topk
=
True
,
num_expert_group
=
config
.
n_group
,
topk_group
=
config
.
topk_group
,
prefix
=
f
"
{
prefix
}
.experts"
,
scoring_func
=
config
.
scoring_func
,
e_score_correction_bias
=
self
.
gate
.
e_score_correction_bias
,
enable_eplb
=
self
.
enable_eplb
,
num_redundant_experts
=
self
.
n_redundant_experts
,
routed_scaling_factor
=
self
.
routed_scaling_factor
,
shared_experts
=
self
.
shared_experts
)
from
vllm.two_batch_overlap.two_batch_overlap
import
tbo_all_reduce
self
.
tbo_all_reduce
=
tbo_all_reduce
...
...
@@ -215,9 +258,10 @@ class DeepseekV2MoE(nn.Module):
xqxs
:
Optional
[
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]]
=
None
)
->
Union
[
torch
.
Tensor
,
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
]]:
if
envs
.
USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT
and
xqxs
is
not
None
:
num_tokens
,
hidden_dim
=
hidden_states
.
shape
hidden_states
=
hidden_states
.
view
(
-
1
,
hidden_dim
)
if
envs
.
USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT
and
xqxs
is
not
None
:
if
self
.
n_shared_experts
is
not
None
:
shared_output
=
self
.
shared_experts
(
hidden_states
,
xqxs
=
xqxs
)
...
...
@@ -257,8 +301,7 @@ class DeepseekV2MoE(nn.Module):
final_hidden_states
))
return
final_hidden_states
.
view
(
num_tokens
,
hidden_dim
)
else
:
num_tokens
,
hidden_dim
=
hidden_states
.
shape
hidden_states
=
hidden_states
.
view
(
-
1
,
hidden_dim
)
if
not
self
.
enable_expert_parallel
:
i_q
,
i_s
=
None
,
None
if
self
.
n_shared_experts
is
not
None
:
if
envs
.
USE_FUSED_RMS_QUANT
:
...
...
@@ -268,7 +311,6 @@ class DeepseekV2MoE(nn.Module):
router_logits
,
_
=
self
.
gate
(
hidden_states
)
if
envs
.
VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
...
...
@@ -296,6 +338,40 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
else
:
router_logits
,
_
=
self
.
gate
(
hidden_states
)
if
self
.
use_deepep
:
shared_output
,
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
final_hidden_states
+
shared_output
else
:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
else
:
if
self
.
n_shared_experts
is
not
None
:
if
envs
.
USE_FUSED_RMS_QUANT
:
shared_output
,
new_resi
=
self
.
shared_experts
(
hidden_states
,
rms_weight
,
residual
,
update_hd
=
True
)
else
:
shared_output
=
self
.
shared_experts
(
hidden_states
)
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
final_hidden_states
+
shared_output
else
:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
if
self
.
tp_size
>
1
:
if
envs
.
VLLM_ENABLE_TBO
:
...
...
@@ -336,6 +412,7 @@ class DeepseekV2Attention(nn.Module):
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
,
reduce_results
:
bool
=
True
,
)
->
None
:
super
().
__init__
()
self
.
hidden_size
=
hidden_size
...
...
@@ -394,7 +471,8 @@ class DeepseekV2Attention(nn.Module):
self
.
hidden_size
,
bias
=
False
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.o_proj"
)
prefix
=
f
"
{
prefix
}
.o_proj"
,
reduce_results
=
reduce_results
)
if
rope_scaling
:
rope_scaling
[
"rope_type"
]
=
'deepseek_yarn'
...
...
@@ -488,6 +566,7 @@ class DeepseekV2MLAAttention(nn.Module):
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
,
reduce_results
:
bool
=
True
,
)
->
None
:
super
().
__init__
()
self
.
hidden_size
=
hidden_size
...
...
@@ -565,7 +644,8 @@ class DeepseekV2MLAAttention(nn.Module):
self
.
hidden_size
,
bias
=
False
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.o_proj"
)
prefix
=
f
"
{
prefix
}
.o_proj"
,
reduce_results
=
reduce_results
)
if
rope_scaling
:
rope_scaling
[
"rope_type"
]
=
'deepseek_yarn'
...
...
@@ -803,6 +883,44 @@ class DeepseekV2DecoderLayer(nn.Module):
# with the layer's index.
layer_idx
=
int
(
prefix
.
split
(
sep
=
'.'
)[
-
1
])
self
.
layer_idx
=
layer_idx
self
.
dp_size
=
get_dp_group
().
world_size
vllm_config
=
get_current_vllm_config
()
parallel_config
=
vllm_config
.
parallel_config
self
.
use_deepep
=
self
.
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
and
\
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
or
\
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_low_latency"
)
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
self
.
config
=
config
self
.
tp_rank
=
get_tensor_model_parallel_rank
()
if
(
config
.
n_routed_experts
is
not
None
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
):
self
.
mlp
=
DeepseekV2MoE
(
config
=
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
enable_eplb
=
enable_eplb
,
)
else
:
self
.
mlp
=
DeepseekV2MLP
(
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
config
.
intermediate_size
,
hidden_act
=
config
.
hidden_act
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
)
self
.
is_mtp_layer
=
False
if
self
.
layer_idx
==
config
.
num_hidden_layers
:
self
.
is_mtp_layer
=
True
reduce_results
=
True
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
\
self
.
tp_size
>
1
and
not
self
.
is_mtp_layer
:
reduce_results
=
False
if
model_config
.
use_mla
:
attn_cls
=
DeepseekV2MLAAttention
else
:
...
...
@@ -823,25 +941,9 @@ class DeepseekV2DecoderLayer(nn.Module):
cache_config
=
cache_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.self_attn"
,
reduce_results
=
reduce_results
)
if
(
config
.
n_routed_experts
is
not
None
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
):
self
.
mlp
=
DeepseekV2MoE
(
config
=
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
enable_eplb
=
enable_eplb
,
)
else
:
self
.
mlp
=
DeepseekV2MLP
(
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
config
.
intermediate_size
,
hidden_act
=
config
.
hidden_act
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
)
self
.
input_layernorm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
self
.
post_attention_layernorm
=
RMSNorm
(
config
.
hidden_size
,
...
...
@@ -850,6 +952,8 @@ class DeepseekV2DecoderLayer(nn.Module):
self
.
use_fused_rms_quant
=
envs
.
USE_FUSED_RMS_QUANT
self
.
use_fused_custom_all_reduce
=
envs
.
USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT
def
forward_fused_rmsquant
(
self
,
positions
:
torch
.
Tensor
,
...
...
@@ -956,11 +1060,27 @@ class DeepseekV2DecoderLayer(nn.Module):
else
:
hidden_states
,
residual
=
self
.
input_layernorm
(
hidden_states
,
residual
)
if
not
self
.
is_mtp_layer
:
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
self
.
tp_size
>
1
and
\
self
.
layer_idx
>
self
.
config
.
first_k_dense_replace
:
hidden_states
=
tensor_model_parallel_all_gather
(
hidden_states
,
dim
=
0
)
hidden_states
=
self
.
self_attn
(
positions
=
positions
,
hidden_states
=
hidden_states
,
)
if
not
self
.
is_mtp_layer
:
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
self
.
tp_size
>
1
:
if
self
.
layer_idx
==
self
.
config
.
first_k_dense_replace
:
residual
=
residual
.
tensor_split
(
self
.
tp_size
)[
self
.
tp_rank
]
hidden_states
=
tensor_model_parallel_reduce_scatter
(
hidden_states
,
dim
=
0
)
if
hidden_states
.
dtype
==
torch
.
float16
:
# Fix FP16 overflow
# We scale both hidden_states and residual before
...
...
@@ -974,8 +1094,19 @@ class DeepseekV2DecoderLayer(nn.Module):
# Fully Connected
hidden_states
,
residual
=
self
.
post_attention_layernorm
(
hidden_states
,
residual
)
if
self
.
is_mtp_layer
:
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
self
.
tp_size
>
1
:
hidden_states
=
hidden_states
.
tensor_split
(
self
.
tp_size
)[
self
.
tp_rank
]
hidden_states
=
self
.
mlp
(
hidden_states
)
if
self
.
is_mtp_layer
:
if
isinstance
(
self
.
mlp
,
DeepseekV2MoE
)
and
self
.
use_deepep
and
self
.
tp_size
>
1
:
hidden_states
=
tensor_model_parallel_all_gather
(
hidden_states
,
dim
=
0
)
if
isinstance
(
self
.
mlp
,
DeepseekV2MLP
)
and
hidden_states
.
dtype
==
torch
.
float16
:
# Fix FP16 overflow
...
...
@@ -1052,6 +1183,14 @@ class DeepseekV2Model(nn.Module):
make_empty_intermediate_tensors_factory
(
[
"hidden_states"
,
"residual"
],
config
.
hidden_size
))
self
.
dp_size
=
get_dp_group
().
world_size
vllm_config
=
get_current_vllm_config
()
parallel_config
=
vllm_config
.
parallel_config
self
.
use_deepep
=
self
.
dp_size
>
1
and
parallel_config
.
enable_expert_parallel
and
\
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
or
\
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_low_latency"
)
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
    """Return the result of applying ``self.embed_tokens`` to *input_ids*."""
    embeddings = self.embed_tokens(input_ids)
    return embeddings
...
...
@@ -1083,6 +1222,10 @@ class DeepseekV2Model(nn.Module):
})
hidden_states
,
_
=
self
.
norm
(
hidden_states
,
residual
)
if
self
.
use_deepep
and
self
.
tp_size
>
1
:
hidden_states
=
tensor_model_parallel_all_gather
(
hidden_states
,
dim
=
0
)
return
hidden_states
...
...
vllm/model_executor/parameter.py
View file @
13130b89
...
...
@@ -96,6 +96,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
def __init__(self, output_dim: int, **kwargs):
    """Record the output dimension and forward the rest to the parent.

    Args:
        output_dim: Stored as ``self._output_dim``.
        **kwargs: Passed unchanged to ``super().__init__``.
    """
    self._output_dim = output_dim
    super().__init__(**kwargs)
    # Initial value is -1. The weight-loading methods visible in this diff
    # only special-case ``expect_tp_size == 1`` (they then use tp_rank 0).
    # NOTE(review): presumably -1 means "no override" -- confirm.
    self.expect_tp_size = -1
@
property
def
output_dim
(
self
):
...
...
@@ -103,6 +105,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
def
load_column_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
):
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
shard_size
=
self
.
data
.
shape
[
self
.
output_dim
]
loaded_weight
=
loaded_weight
.
narrow
(
self
.
output_dim
,
tp_rank
*
shard_size
,
shard_size
)
...
...
@@ -123,6 +127,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
param_data
=
self
.
data
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
param_data
=
param_data
.
narrow
(
self
.
output_dim
,
shard_offset
,
shard_size
)
loaded_weight
=
loaded_weight
.
narrow
(
self
.
output_dim
,
...
...
@@ -167,6 +173,7 @@ class RowvLLMParameter(BasevLLMParameter):
def __init__(self, input_dim: int, **kwargs):
    """Record the input dimension and forward the rest to the parent.

    Args:
        input_dim: Stored as ``self._input_dim``.
        **kwargs: Passed unchanged to ``super().__init__``.
    """
    self._input_dim = input_dim
    super().__init__(**kwargs)
    # Initial value is -1. ``load_row_parallel_weight`` as shown in this
    # diff only special-cases ``expect_tp_size == 1`` (it then uses
    # tp_rank 0).
    # NOTE(review): presumably -1 means "no override" -- confirm.
    self.expect_tp_size = -1
@
property
def
input_dim
(
self
):
...
...
@@ -174,6 +181,8 @@ class RowvLLMParameter(BasevLLMParameter):
def
load_row_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
):
tp_rank
=
get_tensor_model_parallel_rank
()
if
self
.
expect_tp_size
==
1
:
tp_rank
=
0
shard_size
=
self
.
data
.
shape
[
self
.
input_dim
]
loaded_weight
=
loaded_weight
.
narrow
(
self
.
input_dim
,
tp_rank
*
shard_size
,
shard_size
)
...
...
vllm/v1/spec_decode/eagle.py
View file @
13130b89
...
...
@@ -25,6 +25,7 @@ from vllm.v1.attention.backends.mla.common import MLACommonMetadata, MLACommonDe
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.spec_decode.utils
import
prepare_eagle_input_kernel
from
vllm.utils
import
round_up
logger
=
init_logger
(
__name__
)
...
...
@@ -186,6 +187,13 @@ class EagleProposer:
num_input_tokens
=
self
.
vllm_config
.
pad_for_cudagraph
(
num_tokens
)
else
:
num_input_tokens
=
num_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
vllm_config
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# copy inputs to buffer for cudagraph
self
.
positions
[:
num_tokens
]
=
target_positions
self
.
hidden_states
[:
num_tokens
]
=
target_hidden_states
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
13130b89
...
...
@@ -28,7 +28,8 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
from
vllm.distributed.kv_transfer.kv_connector.v1
import
KVConnectorBase_V1
from
vllm.distributed.parallel_state
import
(
get_pp_group
,
get_tp_group
,
graph_capture
,
is_global_first_rank
,
prepare_communication_buffer_for_model
)
prepare_communication_buffer_for_model
,
get_tensor_model_parallel_world_size
)
from
vllm.forward_context
import
(
DPMetadata
,
get_forward_context
,
set_forward_context
,
set_profilling
)
from
vllm.logger
import
init_logger
...
...
@@ -1267,7 +1268,7 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations.
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
:
if
dp_size
==
1
or
self
.
vllm_config
.
model_config
.
enforce_eager
or
envs
.
VLLM_ALL2ALL_BACKEND
==
'naive'
:
# Early exit.
return
0
,
None
...
...
@@ -1361,6 +1362,12 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
else
:
num_input_tokens
=
num_scheduled_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# Padding for DP
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_input_tokens
)
num_input_tokens
+=
num_pad
...
...
@@ -1789,6 +1796,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
decoding
=
spec_decode_metadata
is
not
None
)
spec_token_ids
=
draft_token_ids
.
tolist
()
if
not
envs
.
VLLM_REJECT_SAMPLE_OPT
:
draft_token_ids
=
draft_result
else
:
...
...
@@ -1799,7 +1808,6 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
draft_probs
,
draft_req_ids
)
else
:
self
.
draft_probs
.
update
(
draft_probs
,
draft_req_ids
)
spec_token_ids
=
draft_token_ids
.
tolist
()
return
spec_token_ids
...
...
@@ -1920,6 +1928,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
time_after_load
-
time_before_load
)
prepare_communication_buffer_for_model
(
self
.
model
)
if
hasattr
(
self
,
"drafter"
):
prepare_communication_buffer_for_model
(
self
.
drafter
.
model
)
if
is_mixture_of_experts
(
self
.
model
)
and
self
.
parallel_config
.
enable_eplb
:
logger
.
info
(
"EPLB is enabled for model %s."
,
...
...
@@ -2092,6 +2103,12 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
is_profile
:
bool
=
False
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_tokens
=
round_up
(
num_tokens
,
tp_size
)
# Padding for DP
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_tokens
)
num_tokens
+=
num_pad
...
...
@@ -2156,6 +2173,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
input_ids
=
None
inputs_embeds
=
self
.
inputs_embeds
[:
num_tokens
]
else
:
self
.
input_ids
[:
num_tokens
]
=
torch
.
randint
(
0
,
self
.
model_config
.
get_vocab_size
(),
(
num_tokens
,),
dtype
=
torch
.
int32
)
input_ids
=
self
.
input_ids
[:
num_tokens
]
inputs_embeds
=
None
if
self
.
uses_mrope
:
...
...
@@ -3183,6 +3202,12 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
else
:
num_input_tokens
=
num_scheduled_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# Padding for DP
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_input_tokens
)
num_input_tokens
+=
num_pad
...
...
@@ -3608,16 +3633,17 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
if
not
envs
.
VLLM_REJECT_SAMPLE_OPT
:
draft_token_ids
=
draft_result
else
:
draft_req_ids
=
list
(
scheduler_output
.
num_scheduled_tokens
.
keys
())
draft_token_ids
,
draft_probs
=
draft_result
spec_token_ids
=
draft_token_ids
.
tolist
()
if
envs
.
VLLM_REJECT_SAMPLE_OPT
:
draft_req_ids
=
list
(
scheduler_output
.
num_scheduled_tokens
.
keys
())
if
self
.
draft_probs
is
None
:
self
.
draft_probs
=
DraftProbs
(
draft_probs
,
draft_req_ids
)
else
:
self
.
draft_probs
.
update
(
draft_probs
,
draft_req_ids
)
spec_token_ids
=
draft_token_ids
.
tolist
()
return
spec_token_ids
# TODO: once stable, replace GPUModelRunner with GPUModelRunnerMTP
if
envs
.
VLLM_USE_ZERO_MTP
:
...
...
vllm/zero_overhead/v1/eagle.py
View file @
13130b89
...
...
@@ -10,6 +10,7 @@ from vllm.v1.attention.backends.mla.common import MLACommonMetadata
from
vllm.v1.attention.backends.utils
import
CommonAttentionMetadata
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.spec_decode.eagle
import
PADDING_SLOT_ID
,
EagleProposer
from
vllm.utils
import
round_up
class
V1ZeroEagleProposer
(
EagleProposer
):
...
...
@@ -110,6 +111,13 @@ class V1ZeroEagleProposer(EagleProposer):
num_input_tokens
=
self
.
vllm_config
.
pad_for_cudagraph
(
num_tokens
)
else
:
num_input_tokens
=
num_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
vllm_config
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# copy inputs to buffer for cudagraph
self
.
positions
[:
num_tokens
]
=
target_positions
self
.
hidden_states
[:
num_tokens
]
=
target_hidden_states
...
...
vllm/zero_overhead/v1/gpu_model_runner.py
View file @
13130b89
...
...
@@ -441,6 +441,12 @@ class V1ZeroModelRunner(GPUModelRunner):
else
:
num_input_tokens
=
num_scheduled_tokens
# make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
tp_size
=
self
.
vllm_config
.
parallel_config
.
tensor_parallel_size
if
self
.
parallel_config
.
enable_expert_parallel
and
dp_size
>
1
and
tp_size
>
1
:
num_input_tokens
=
round_up
(
num_input_tokens
,
tp_size
)
# Padding for DP
num_pad
,
num_tokens_across_dp
=
self
.
get_dp_padding
(
num_input_tokens
)
num_input_tokens
+=
num_pad
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment