Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
46e26bf1
Commit
46e26bf1
authored
Sep 30, 2025
by
王敏
Browse files
修复部分代码
parent
83f2f396
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
79 additions
and
66 deletions
+79
-66
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+68
-62
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+11
-4
No files found.
vllm/model_executor/layers/quantization/slimquant_w4a8.py
View file @
46e26bf1
...
@@ -158,11 +158,14 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
...
@@ -158,11 +158,14 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
layer
:
torch
.
nn
.
Module
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
,
silu_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
):
if
envs
.
USE_FUSED_RMS_QUANT
and
input_quant_args
is
not
None
:
if
envs
.
USE_FUSED_RMS_QUANT
and
input_quant_args
is
not
None
:
assert
len
(
input_quant_args
)
==
2
assert
len
(
input_quant_args
)
==
2
x_q
,
x_scale
=
input_quant_args
x_q
,
x_scale
=
input_quant_args
elif
envs
.
USE_FUSED_SILU_MUL_QUANT
and
silu_quant_args
is
not
None
:
x_q
,
x_scale
=
silu_quant_args
else
:
else
:
x_q
,
x_scale
=
per_token_quant_int8
(
x
)
x_q
,
x_scale
=
per_token_quant_int8
(
x
)
...
@@ -373,65 +376,68 @@ class SlimQuantW4A8Int8MoEMethod:
...
@@ -373,65 +376,68 @@ class SlimQuantW4A8Int8MoEMethod:
)
)
def
apply
(
# tp
def
apply
(
self
,
self
,
layer
:
torch
.
nn
.
Module
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
router_logits
:
torch
.
Tensor
,
router_logits
:
torch
.
Tensor
,
top_k
:
int
,
top_k
:
int
,
renormalize
:
bool
,
renormalize
:
bool
,
use_grouped_topk
:
bool
=
False
,
use_grouped_topk
:
bool
=
False
,
topk_group
:
Optional
[
int
]
=
None
,
topk_group
:
Optional
[
int
]
=
None
,
num_expert_group
:
Optional
[
int
]
=
None
,
num_expert_group
:
Optional
[
int
]
=
None
,
global_num_experts
:
int
=
-
1
,
global_num_experts
:
int
=
-
1
,
expert_map
:
Optional
[
torch
.
Tensor
]
=
None
,
expert_map
:
Optional
[
torch
.
Tensor
]
=
None
,
custom_routing_function
:
Optional
[
Callable
]
=
None
,
custom_routing_function
:
Optional
[
Callable
]
=
None
,
scoring_func
:
str
=
"softmax"
,
scoring_func
:
str
=
"softmax"
,
e_score_correction_bias
:
Optional
[
torch
.
Tensor
]
=
None
,
e_score_correction_bias
:
Optional
[
torch
.
Tensor
]
=
None
,
apply_router_weight_on_input
:
bool
=
False
,
apply_router_weight_on_input
:
bool
=
False
,
activation
:
str
=
"silu"
,
activation
:
str
=
"silu"
,
enable_eplb
:
bool
=
False
,
enable_eplb
:
bool
=
False
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
use_fused_gate
:
Optional
[
bool
]
=
False
,
use_fused_gate
:
Optional
[
bool
]
=
False
,
**
_
shared_output
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
**
_
from
vllm.model_executor.layers.fused_moe
import
fused_experts
)
->
torch
.
Tensor
:
if
enable_eplb
:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
raise
NotImplementedError
(
if
enable_eplb
:
"EPLB not supported for `SlimQuantW4A8Int8MoEMethod` yet."
)
raise
NotImplementedError
(
# Expert selection
"EPLB not supported for `SlimQuantW4A8Int8MoEMethod` yet."
)
topk_weights
,
topk_ids
=
FusedMoE
.
select_experts
(
# Expert selection
hidden_states
=
x
,
topk_weights
,
topk_ids
=
FusedMoE
.
select_experts
(
router_logits
=
router_logits
,
hidden_states
=
x
,
use_grouped_topk
=
use_grouped_topk
,
router_logits
=
router_logits
,
top_k
=
top_k
,
use_grouped_topk
=
use_grouped_topk
,
renormalize
=
renormalize
,
top_k
=
top_k
,
topk_group
=
topk_group
,
renormalize
=
renormalize
,
num_expert_group
=
num_expert_group
,
topk_group
=
topk_group
,
custom_routing_function
=
custom_routing_function
,
num_expert_group
=
num_expert_group
,
scoring_func
=
scoring_func
,
custom_routing_function
=
custom_routing_function
,
e_score_correction_bias
=
e_score_correction_bias
,
scoring_func
=
scoring_func
,
routed_scaling_factor
=
routed_scaling_factor
,
e_score_correction_bias
=
e_score_correction_bias
,
use_fused_gate
=
use_fused_gate
routed_scaling_factor
=
routed_scaling_factor
,
)
use_fused_gate
=
use_fused_gate
)
return
fused_experts
(
return
fused_experts
(
x
,
x
,
layer
.
w13_weight
,
layer
.
w13_weight
,
layer
.
w2_weight
,
layer
.
w2_weight
,
topk_weights
=
topk_weights
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
topk_ids
=
topk_ids
,
inplace
=
True
,
inplace
=
True
,
use_int4_w4a8
=
True
,
use_int4_w4a8
=
True
,
per_channel_quant
=
True
,
per_channel_quant
=
True
,
activation
=
activation
,
activation
=
activation
,
expert_map
=
expert_map
,
expert_map
=
expert_map
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
global_num_experts
=
global_num_experts
,
global_num_experts
=
global_num_experts
,
w1_scale
=
(
layer
.
w13_weight_scale
),
w1_scale
=
(
layer
.
w13_weight_scale
),
w2_scale
=
(
layer
.
w2_weight_scale
),
w2_scale
=
(
layer
.
w2_weight_scale
),
a1_scale
=
layer
.
w13_input_scale
,
a1_scale
=
layer
.
w13_input_scale
,
a2_scale
=
layer
.
w2_input_scale
,
a2_scale
=
layer
.
w2_input_scale
,
use_nn_moe
=
use_nn_moe
,
use_nn_moe
=
use_nn_moe
,
)
shared_output
=
shared_output
,
routed_scaling_factor
=
routed_scaling_factor
,
)
vllm/model_executor/models/deepseek_v2.py
View file @
46e26bf1
...
@@ -103,8 +103,12 @@ class DeepseekV2MLP(nn.Module):
...
@@ -103,8 +103,12 @@ class DeepseekV2MLP(nn.Module):
):
):
if
envs
.
USE_FUSED_RMS_QUANT
:
if
envs
.
USE_FUSED_RMS_QUANT
:
gate_up
,
new_resi
,
_
=
self
.
gate_up_proj
(
x
,
rms_weight
,
residual
,
update_hd
=
update_hd
)
gate_up
,
new_resi
,
_
=
self
.
gate_up_proj
(
x
,
rms_weight
,
residual
,
update_hd
=
update_hd
)
x
=
self
.
act_fn
(
gate_up
)
if
envs
.
USE_FUSED_SILU_MUL_QUANT
:
x
,
_
=
self
.
down_proj
(
x
)
x
,
_
=
self
.
down_proj
(
gate_up
,
use_fused_silu_mul_quant
=
True
)
else
:
x
=
self
.
act_fn
(
gate_up
)
x
,
_
=
self
.
down_proj
(
x
)
return
x
,
new_resi
return
x
,
new_resi
else
:
else
:
gate_up
,
_
=
self
.
gate_up_proj
(
x
)
gate_up
,
_
=
self
.
gate_up_proj
(
x
)
...
@@ -574,7 +578,10 @@ class DeepseekV2MLAAttention(nn.Module):
...
@@ -574,7 +578,10 @@ class DeepseekV2MLAAttention(nn.Module):
kv_c
,
k_pe
=
self
.
kv_a_proj_with_mqa
(
hidden_states
,
quant_args
=
input_quant_args
,
update_hd
=
False
)[
0
].
split
(
kv_c
,
k_pe
=
self
.
kv_a_proj_with_mqa
(
hidden_states
,
quant_args
=
input_quant_args
,
update_hd
=
False
)[
0
].
split
(
[
self
.
kv_lora_rank
,
self
.
qk_rope_head_dim
],
dim
=-
1
)
[
self
.
kv_lora_rank
,
self
.
qk_rope_head_dim
],
dim
=-
1
)
kv_c_normed
=
self
.
kv_a_layernorm
(
kv_c
.
contiguous
())
if
envs
.
VLLM_USE_LIGHTOP
:
kv_c_normed
=
self
.
kv_a_layernorm
.
forward_cuda_opt
(
kv_c
)
else
:
kv_c_normed
=
self
.
kv_a_layernorm
(
kv_c
.
contiguous
())
q
=
q
.
view
(
-
1
,
self
.
num_local_heads
,
self
.
qk_head_dim
)
q
=
q
.
view
(
-
1
,
self
.
num_local_heads
,
self
.
qk_head_dim
)
# Add head dim of 1 to k_pe
# Add head dim of 1 to k_pe
...
@@ -1160,4 +1167,4 @@ def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
...
@@ -1160,4 +1167,4 @@ def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
for
i
in
range
(
config
.
num_nextn_predict_layers
):
for
i
in
range
(
config
.
num_nextn_predict_layers
):
if
weight_name
.
startswith
(
f
"model.layers.
{
layer_idx
+
i
}
."
):
if
weight_name
.
startswith
(
f
"model.layers.
{
layer_idx
+
i
}
."
):
return
layer_idx
+
i
return
layer_idx
+
i
return
None
return
None
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment