Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dbb27c0e
Commit
dbb27c0e
authored
Sep 23, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev' into v0.9.2-dev-ds
parents
7904da3f
3320343d
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
46 additions
and
15 deletions
+46
-15
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
...ted/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+4
-3
vllm/envs.py
vllm/envs.py
+10
-3
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+1
-1
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+21
-5
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+4
-1
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+6
-2
No files found.
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
View file @
dbb27c0e
...
...
@@ -289,9 +289,10 @@ class P2pNcclConnector(KVConnectorBase_V1):
kv_cache
,
remote_address
)
def
wait_for_save
(
self
):
if
self
.
is_producer
:
assert
self
.
p2p_nccl_engine
is
not
None
self
.
p2p_nccl_engine
.
wait_for_sent
()
pass
# if self.is_producer:
# assert self.p2p_nccl_engine is not None
# self.p2p_nccl_engine.wait_for_sent()
def
get_finished
(
self
,
finished_req_ids
:
set
[
str
],
...
...
vllm/envs.py
View file @
dbb27c0e
...
...
@@ -168,7 +168,8 @@ if TYPE_CHECKING:
VLLM_USE_TRITON_CAT
:
bool
=
False
VLLM_USE_MERGE_ATTN_STATES_OPT
:
bool
=
False
USE_FUSED_RMS_QUANT
:
bool
=
False
VLLM_USE_DEEPSEEK_MOE_SUM_MUL_AND
:
bool
=
False
VLLM_USE_DEEPSEEK_MOE_SUM_MUL_ADD
:
bool
=
False
USE_FUSED_SILU_MUL_QUANT
:
bool
=
False
def
get_default_cache_root
():
return
os
.
getenv
(
...
...
@@ -1110,9 +1111,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
"USE_FUSED_RMS_QUANT"
:
lambda
:
(
os
.
getenv
(
'USE_FUSED_RMS_QUANT'
,
'0'
).
lower
()
in
(
"true"
,
"1"
)),
# vllm will use lightop's moe_sum fusion operator for deepseek
"VLLM_USE_DEEPSEEK_MOE_SUM_MUL_AND"
:
lambda
:
(
os
.
getenv
(
'VLLM_USE_DEEPSEEK_MOE_SUM_MUL_AND'
,
'True'
).
lower
()
in
"VLLM_USE_DEEPSEEK_MOE_SUM_MUL_ADD"
:
lambda
:
(
os
.
getenv
(
'VLLM_USE_DEEPSEEK_MOE_SUM_MUL_ADD'
,
'True'
).
lower
()
in
(
"true"
,
"1"
)),
# vllm will use silu_mul_quant fused op
"USE_FUSED_SILU_MUL_QUANT"
:
lambda
:
(
os
.
getenv
(
'USE_FUSED_SILU_MUL_QUANT'
,
'0'
).
lower
()
in
(
"true"
,
"1"
)),
}
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
dbb27c0e
...
...
@@ -1760,7 +1760,7 @@ def fused_experts_impl(
block_shape
=
block_shape
,
use_nn_moe
=
use_nn_moe
)
if
envs
.
VLLM_USE_DEEPSEEK_MOE_SUM_MUL_A
N
D
:
if
envs
.
VLLM_USE_DEEPSEEK_MOE_SUM_MUL_A
D
D
:
if
envs
.
VLLM_USE_LIGHT_OP
and
not
dpsk_fp16_quick
:
if
shared_output
is
not
None
:
op
.
moe_sum
(
intermediate_cache3
.
view
(
*
intermediate_cache3
.
size
()),
...
...
vllm/model_executor/layers/linear.py
View file @
dbb27c0e
...
...
@@ -38,6 +38,12 @@ if envs.USE_FUSED_RMS_QUANT:
from
lmslim.quantize.quant_ops
import
lm_faster_rmsquant
except
Exception
as
e
:
print
(
f
"Error: Import fused rmsquant error:
{
e
}
"
)
if
envs
.
USE_FUSED_SILU_MUL_QUANT
:
try
:
# from lightop import fuse_silu_mul_quant
from
lmslim.quantize.quant_ops
import
lm_fuse_silu_mul_quant
except
Exception
as
e
:
print
(
f
"Error: Import fused silu_mul_qunat error:
{
e
}
"
)
logger
=
init_logger
(
__name__
)
...
...
@@ -1488,7 +1494,8 @@ class RowParallelLinear(LinearBase):
param
.
load_row_parallel_weight
(
loaded_weight
=
loaded_weight
)
def
forward
(
self
,
input_
self
,
input_
,
use_fused_silu_mul_quant
:
Optional
[
bool
]
=
False
)
->
Union
[
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
Optional
[
Parameter
]]]:
if
self
.
input_is_parallel
:
input_parallel
=
input_
...
...
@@ -1503,6 +1510,15 @@ class RowParallelLinear(LinearBase):
# Only fuse bias add into GEMM for rank 0 (this ensures that
# bias will not get added more than once in TP>1 case)
bias_
=
None
if
(
self
.
tp_rank
>
0
or
self
.
skip_bias_add
)
else
self
.
bias
if
use_fused_silu_mul_quant
:
xq
,
xs
=
lm_fuse_silu_mul_quant
(
input_parallel
)
silu_quant_args
=
[
xq
,
xs
]
output_parallel
=
self
.
quant_method
.
apply
(
self
,
input_parallel
,
bias
=
bias_
,
silu_quant_args
=
silu_quant_args
)
else
:
output_parallel
=
self
.
quant_method
.
apply
(
self
,
input_parallel
,
bias
=
bias_
)
...
...
vllm/model_executor/layers/quantization/slimquant_w4a8.py
View file @
dbb27c0e
...
...
@@ -154,11 +154,14 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
,
silu_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
if
envs
.
USE_FUSED_RMS_QUANT
and
input_quant_args
is
not
None
:
assert
len
(
input_quant_args
)
==
2
x_q
,
x_scale
=
input_quant_args
elif
envs
.
USE_FUSED_SILU_MUL_QUANT
and
silu_quant_args
is
not
None
:
x_q
,
x_scale
=
silu_quant_args
else
:
x_q
,
x_scale
=
per_token_quant_int8
(
x
)
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
dbb27c0e
...
...
@@ -101,8 +101,12 @@ class DeepseekV2MLP(nn.Module):
):
if
envs
.
USE_FUSED_RMS_QUANT
:
gate_up
,
new_resi
,
_
=
self
.
gate_up_proj
(
x
,
rms_weight
,
residual
,
update_hd
=
update_hd
)
if
envs
.
USE_FUSED_SILU_MUL_QUANT
:
x
,
_
=
self
.
down_proj
(
gate_up
,
use_fused_silu_mul_quant
=
True
)
else
:
x
=
self
.
act_fn
(
gate_up
)
x
,
_
=
self
.
down_proj
(
x
)
return
x
,
new_resi
else
:
gate_up
,
_
=
self
.
gate_up_proj
(
x
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment