Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
60b37c6b
Commit
60b37c6b
authored
Jan 07, 2026
by
zhuwenwen
Browse files
remove USE_FUSED_RMS_QUANT and USE_FUSED_SILU_MUL_QUANT
parent
c964b9ad
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
17 additions
and
57 deletions
+17
-57
vllm/envs.py
vllm/envs.py
+0
-10
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+1
-11
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+1
-8
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+10
-10
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+5
-18
No files found.
vllm/envs.py
View file @
60b37c6b
...
...
@@ -270,8 +270,6 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_MOE_SUM
:
bool
=
False
VLLM_USE_LIGHTOP_MOE_ALIGN
:
bool
=
False
VLLM_USE_MERGE_ATTN_STATES_OPT
:
bool
=
False
USE_FUSED_RMS_QUANT
:
bool
=
False
USE_FUSED_SILU_MUL_QUANT
:
bool
=
False
VLLM_USE_PD_SPLIT
:
bool
=
False
VLLM_USE_PP_SYNC
:
bool
=
False
VLLM_USE_PIECEWISE
:
bool
=
False
...
...
@@ -1726,14 +1724,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_MERGE_ATTN_STATES_OPT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_MERGE_ATTN_STATES_OPT"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
# vllm will use rmsquant fused op
"USE_FUSED_RMS_QUANT"
:
lambda
:
(
os
.
getenv
(
'USE_FUSED_RMS_QUANT'
,
'0'
).
lower
()
in
(
"true"
,
"1"
)),
# vllm will use silu_mul_quant fused op
"USE_FUSED_SILU_MUL_QUANT"
:
lambda
:
(
os
.
getenv
(
'USE_FUSED_SILU_MUL_QUANT'
,
'0'
).
lower
()
in
(
"true"
,
"1"
)),
# vLLM will split prefill and decode, not mix up
"VLLM_USE_PD_SPLIT"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_PD_SPLIT"
,
"False"
).
lower
()
in
...
...
vllm/model_executor/layers/linear.py
View file @
60b37c6b
...
...
@@ -1592,7 +1592,6 @@ class RowParallelLinear(LinearBase):
def
forward
(
self
,
input_
,
use_fused_silu_mul_quant
:
bool
|
None
=
False
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
Parameter
|
None
]:
if
self
.
input_is_parallel
:
input_parallel
=
input_
...
...
@@ -1607,16 +1606,7 @@ class RowParallelLinear(LinearBase):
# Only fuse bias add into GEMM for rank 0 (this ensures that
# bias will not get added more than once in TP>1 case)
bias_
=
None
if
(
self
.
tp_rank
>
0
or
self
.
skip_bias_add
)
else
self
.
bias
if
use_fused_silu_mul_quant
:
xq
,
xs
=
lm_fuse_silu_mul_quant
(
input_parallel
)
silu_quant_args
=
[
xq
,
xs
]
output_parallel
=
self
.
quant_method
.
apply
(
self
,
input_parallel
,
bias_
,
silu_quant_args
=
silu_quant_args
)
else
:
output_parallel
=
self
.
quant_method
.
apply
(
self
,
input_parallel
,
bias_
)
output_parallel
=
self
.
quant_method
.
apply
(
self
,
input_parallel
,
bias_
)
if
self
.
reduce_results
and
self
.
tp_size
>
1
:
output
=
tensor_model_parallel_all_reduce
(
output_parallel
)
...
...
vllm/model_executor/layers/quantization/slimquant_w4a8.py
View file @
60b37c6b
...
...
@@ -159,14 +159,7 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
,
silu_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
if
envs
.
USE_FUSED_RMS_QUANT
and
input_quant_args
is
not
None
:
assert
len
(
input_quant_args
)
==
2
x_q
,
x_scale
=
input_quant_args
elif
envs
.
USE_FUSED_SILU_MUL_QUANT
and
silu_quant_args
is
not
None
:
assert
len
(
silu_quant_args
)
==
2
x_q
,
x_scale
=
silu_quant_args
else
:
x_q
,
x_scale
=
per_token_quant_int8
(
x
)
x_q
,
x_scale
=
per_token_quant_int8
(
x
)
if
self
.
w8a8_strategy
==
1
:
m
=
x_q
.
shape
[
0
]
...
...
vllm/model_executor/model_loader/utils.py
View file @
60b37c6b
...
...
@@ -196,11 +196,11 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
os
.
environ
[
'VLLM_USE_LIGHTOP'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_OPT_CAT"
):
os
.
environ
[
'VLLM_USE_OPT_CAT'
]
=
'1'
if
model_config
.
quantization
in
{
"slimquant_w4a8"
,
"slimquant_w4a8_marlin"
,
"slimquant_compressed_tensors_marlin"
,
"compressed-tensors"
}:
if
not
envs
.
is_set
(
"USE_FUSED_RMS_QUANT"
):
os
.
environ
[
'USE_FUSED_RMS_QUANT'
]
=
'1'
if
not
envs
.
is_set
(
"USE_FUSED_SILU_MUL_QUANT"
):
os
.
environ
[
'USE_FUSED_SILU_MUL_QUANT'
]
=
'1'
#
if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
#
if not envs.is_set("USE_FUSED_RMS_QUANT"):
#
os.environ['USE_FUSED_RMS_QUANT'] = '1'
#
if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"):
#
os.environ['USE_FUSED_SILU_MUL_QUANT'] = '1'
else
:
if
not
envs
.
is_set
(
"VLLM_USE_PD_SPLIT"
):
os
.
environ
[
'VLLM_USE_PD_SPLIT'
]
=
'1'
...
...
@@ -228,11 +228,11 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
os
.
environ
[
'VLLM_USE_LIGHTOP'
]
=
'1'
if
not
envs
.
is_set
(
"VLLM_USE_OPT_CAT"
):
os
.
environ
[
'VLLM_USE_OPT_CAT'
]
=
'1'
if
model_config
.
quantization
in
{
"slimquant_w4a8"
,
"slimquant_w4a8_marlin"
,
"slimquant_compressed_tensors_marlin"
,
"compressed-tensors"
}:
if
not
envs
.
is_set
(
"USE_FUSED_RMS_QUANT"
):
os
.
environ
[
'USE_FUSED_RMS_QUANT'
]
=
'1'
if
not
envs
.
is_set
(
"USE_FUSED_SILU_MUL_QUANT"
):
os
.
environ
[
'USE_FUSED_SILU_MUL_QUANT'
]
=
'1'
#
if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
#
if not envs.is_set("USE_FUSED_RMS_QUANT"):
#
os.environ['USE_FUSED_RMS_QUANT'] = '1'
#
if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"):
#
os.environ['USE_FUSED_SILU_MUL_QUANT'] = '1'
else
:
if
not
envs
.
is_set
(
"VLLM_USE_PD_SPLIT"
):
os
.
environ
[
'VLLM_USE_PD_SPLIT'
]
=
'1'
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
60b37c6b
...
...
@@ -232,24 +232,11 @@ class DeepseekV2MLP(nn.Module):
)
self
.
act_fn
=
SiluAndMul
()
def
forward
(
self
,
x
,
rms_weight
:
torch
.
Tensor
|
None
=
None
,
residual
:
torch
.
Tensor
|
None
=
None
,
update_hd
:
bool
|
None
=
False
):
if
envs
.
USE_FUSED_RMS_QUANT
:
gate_up
,
new_resi
,
_
=
self
.
gate_up_proj
(
x
,
rms_weight
,
residual
,
update_hd
=
update_hd
)
if
envs
.
USE_FUSED_SILU_MUL_QUANT
:
x
,
_
=
self
.
down_proj
(
gate_up
,
use_fused_silu_mul_quant
=
True
)
else
:
x
=
self
.
act_fn
(
gate_up
)
x
,
_
=
self
.
down_proj
(
x
)
return
x
,
new_resi
else
:
gate_up
,
_
=
self
.
gate_up_proj
(
x
)
x
=
self
.
act_fn
(
gate_up
)
x
,
_
=
self
.
down_proj
(
x
)
return
x
def
forward
(
self
,
x
):
gate_up
,
_
=
self
.
gate_up_proj
(
x
)
x
=
self
.
act_fn
(
gate_up
)
x
,
_
=
self
.
down_proj
(
x
)
return
x
class
DeepseekV2MoE
(
nn
.
Module
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment