Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
216e414b
Commit
216e414b
authored
Nov 21, 2025
by
wujl5
Committed by
zhuwenwen
Nov 21, 2025
Browse files
deepseek_v2_w8a8 增加 silu_mul_quant融合
parent
2b47bce9
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
21 additions
and
9 deletions
+21
-9
vllm/envs.py
vllm/envs.py
+2
-3
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
...ers/quantization/compressed_tensors/compressed_tensors.py
+6
-2
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
...ompressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+5
-2
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+1
-0
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+7
-2
No files found.
vllm/envs.py
View file @
216e414b
...
...
@@ -172,7 +172,7 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_MOE_SUM
:
bool
=
False
VLLM_USE_LIGHTOP_MOE_ALIGN
:
bool
=
False
VLLM_USE_MERGE_ATTN_STATES_OPT
:
bool
=
False
USE_FUSED_RMS_QUANT
:
bool
=
False
USE_FUSED_RMS_QUANT
:
bool
=
True
USE_FUSED_SILU_MUL_QUANT
:
bool
=
True
VLLM_P2P_ASYNC
:
bool
=
False
VLLM_P2P_BUF_TOKENS
:
int
=
30000
...
...
@@ -1142,8 +1142,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
(
"true"
,
"1"
)),
# vllm will use rmsquant fused op
"USE_FUSED_RMS_QUANT"
:
lambda
:
(
os
.
getenv
(
'USE_FUSED_RMS_QUANT'
,
'0'
).
lower
()
in
(
"true"
,
"1"
)),
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_RMS_QUANT"
,
"1"
))),
# vllm will use silu_mul_quant fused op,
# This variable has a default value of true,
# but it is still controlled by CRQ and RQ.
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
View file @
216e414b
...
...
@@ -669,7 +669,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
,
silu_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
"""
Use the output of create_weights and the CompressedTensorsScheme
associated with the layer to apply the forward pass with the
...
...
@@ -680,7 +681,10 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
scheme
=
layer
.
scheme
if
scheme
is
None
:
raise
ValueError
(
"A scheme must be defined for each layer"
)
return
scheme
.
apply_weights
(
layer
,
x
,
bias
=
bias
,
input_quant_args
=
input_quant_args
)
return
scheme
.
apply_weights
(
layer
,
x
,
bias
=
bias
,
input_quant_args
=
input_quant_args
,
silu_quant_args
=
silu_quant_args
)
class
CompressedTensorsKVCacheMethod
(
BaseKVCacheMethod
):
...
...
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
View file @
216e414b
...
...
@@ -112,7 +112,9 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
def
apply_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
],
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
)
->
torch
.
Tensor
:
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
,
silu_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
)
->
torch
.
Tensor
:
# return self.kernel.apply_weights(layer, x, bias)
...
...
@@ -124,4 +126,5 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
azp_adj
=
layer
.
azp_adj
,
bias
=
bias
,
w8a8_strategy
=
self
.
w8a8_strategy
,
input_quant_args
=
input_quant_args
)
\ No newline at end of file
input_quant_args
=
input_quant_args
,
silu_quant_args
=
silu_quant_args
)
\ No newline at end of file
vllm/model_executor/layers/quantization/slimquant_w4a8.py
View file @
216e414b
...
...
@@ -168,6 +168,7 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
assert
len
(
input_quant_args
)
==
2
x_q
,
x_scale
=
input_quant_args
elif
envs
.
USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT
and
silu_quant_args
is
not
None
:
assert
len
(
silu_quant_args
)
==
2
x_q
,
x_scale
=
silu_quant_args
else
:
x_q
,
x_scale
=
per_token_quant_int8
(
x
)
...
...
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
216e414b
...
...
@@ -406,7 +406,8 @@ def apply_int8_linear(
azp_adj
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
w8a8_strategy
:
Optional
[
int
]
=
0
,
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
,
silu_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
# ops.scaled_int8_quant supports both dynamic and static quant.
# * dynamic, layer.input_scale is None and x_scale computed from x.
...
...
@@ -416,7 +417,11 @@ def apply_int8_linear(
assert
len
(
input_quant_args
)
==
2
x_zp
=
None
x_q
,
x_scale
=
input_quant_args
else
:
# not USE_FUSED_RMS_QUANT
elif
envs
.
USE_FUSED_RMS_QUANT
and
silu_quant_args
is
not
None
:
assert
len
(
silu_quant_args
)
==
2
x_zp
=
None
x_q
,
x_scale
=
silu_quant_args
else
:
# default
symmetric
=
azp_adj
is
None
if
input_scale
is
None
and
input_zero_point
is
None
and
symmetric
is
True
:
x_q
,
x_scale
=
per_token_quant_int8
(
input
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment