Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7293a072
Commit
7293a072
authored
Nov 01, 2025
by
王敏
Browse files
Merge remote-tracking branch 'origin/v0.9.2-dev-ds' into v0.9.2-dev-ds
# Conflicts: # vllm/model_executor/models/deepseek_v2.py
parents
98b7432a
db2c32b0
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
115 additions
and
55 deletions
+115
-55
vllm/attention/layer.py
vllm/attention/layer.py
+3
-3
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
...ted/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+72
-17
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
...ibuted/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
+35
-16
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+5
-19
No files found.
vllm/attention/layer.py
View file @
7293a072
...
@@ -123,9 +123,9 @@ class Attention(nn.Module):
...
@@ -123,9 +123,9 @@ class Attention(nn.Module):
assert
isinstance
(
quant_method
,
BaseKVCacheMethod
)
assert
isinstance
(
quant_method
,
BaseKVCacheMethod
)
# TODO (mgoin): kv cache dtype should be specified in the FP8
# TODO (mgoin): kv cache dtype should be specified in the FP8
# checkpoint config and become the "auto" behavior
# checkpoint config and become the "auto" behavior
if
self
.
kv_cache_dtype
==
"fp8_e5m2"
:
#
if self.kv_cache_dtype == "fp8_e5m2":
raise
ValueError
(
"fp8_e5m2 kv-cache is not supported with "
#
raise ValueError("fp8_e5m2 kv-cache is not supported with "
"fp8 checkpoints."
)
#
"fp8 checkpoints.")
# If quantization is enabled, we make "k_scale" and "v_scale"
# If quantization is enabled, we make "k_scale" and "v_scale"
# parameters so that it can be loaded from the model checkpoint.
# parameters so that it can be loaded from the model checkpoint.
# The k/v_scale will then be converted back to native float32
# The k/v_scale will then be converted back to native float32
...
...
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
View file @
7293a072
...
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional
...
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional
import
regex
as
re
import
regex
as
re
import
torch
import
torch
import
os
from
vllm
import
envs
from
vllm
import
envs
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
(
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
(
...
@@ -103,6 +104,35 @@ class P2pNcclConnector(KVConnectorBase_V1):
...
@@ -103,6 +104,35 @@ class P2pNcclConnector(KVConnectorBase_V1):
self
.
total_num_hidden_layers
=
getattr
(
self
.
model_config
.
hf_text_config
,
self
.
total_num_hidden_layers
=
getattr
(
self
.
model_config
.
hf_text_config
,
"num_hidden_layers"
,
0
)
"num_hidden_layers"
,
0
)
self
.
pp_size
=
self
.
parallel_config
.
pipeline_parallel_size
self
.
pp_size
=
self
.
parallel_config
.
pipeline_parallel_size
self
.
tp_size
=
self
.
parallel_config
.
tensor_parallel_size
self
.
num_card
=
self
.
pp_size
*
self
.
tp_size
self
.
multiple_machines
=
1
if
self
.
num_card
>
8
else
0
if
self
.
is_producer
and
self
.
multiple_machines
==
1
:
self
.
ip_map
=
{}
self
.
duplicate_keys
=
[]
config_file
=
os
.
getenv
(
'IP_CONFIG_FILE'
)
if
not
config_file
:
print
(
"Warning: Please set the IPVNet FILE environment variable for cross machine recognition of the second IP address"
)
return
try
:
with
open
(
config_file
,
'r'
,
encoding
=
'utf-8'
)
as
file
:
for
line_num
,
line
in
enumerate
(
file
,
1
):
line
=
line
.
strip
()
if
line
and
not
line
.
startswith
(
'#'
):
ips
=
line
.
split
()
if
len
(
ips
)
==
2
:
first_ip
,
second_ip
=
ips
if
first_ip
not
in
self
.
ip_map
:
self
.
ip_map
[
first_ip
]
=
second_ip
else
:
print
(
f
"warning: num
{
line_num
}
Incorrect format :
{
line
}
"
)
except
Exception
as
e
:
print
(
f
"Error: Exception occurred while reading configuration file -
{
e
}
"
)
def
get_ip_value
(
self
,
key
):
return
self
.
ip_map
.
get
(
key
)
# ==============================
# ==============================
# Worker-side methods
# Worker-side methods
...
@@ -252,7 +282,13 @@ class P2pNcclConnector(KVConnectorBase_V1):
...
@@ -252,7 +282,13 @@ class P2pNcclConnector(KVConnectorBase_V1):
2
,
num_pages
*
page_size
,
-
1
)
2
,
num_pages
*
page_size
,
-
1
)
inject_start_index
=
0
inject_start_index
=
0
for
num
in
range
(
self
.
p2p_nccl_engine
.
tensor_split_num
):
req_layer
=
f
"
{
request
.
request_id
}
#
{
layer_name
}
"
with
self
.
p2p_nccl_engine
.
recv_store_cv
:
while
req_layer
not
in
self
.
p2p_nccl_engine
.
recv_split_nums
:
self
.
p2p_nccl_engine
.
recv_store_cv
.
wait
()
split_num
=
self
.
p2p_nccl_engine
.
recv_split_nums
.
get
(
req_layer
)
for
num
in
range
(
split_num
):
kv_cache
=
self
.
p2p_nccl_engine
.
recv_tensor
(
kv_cache
=
self
.
p2p_nccl_engine
.
recv_tensor
(
request
.
request_id
+
"#"
+
layer_name
+
"#"
+
str
(
num
))
request
.
request_id
+
"#"
+
layer_name
+
"#"
+
str
(
num
))
...
@@ -280,6 +316,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
...
@@ -280,6 +316,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
# inject_kv_into_layer(kv_cache_layer, kv_cache,
# inject_kv_into_layer(kv_cache_layer, kv_cache,
# request.slot_mapping, request.request_id)
# request.slot_mapping, request.request_id)
tensor_id
=
request
.
request_id
+
"#"
+
layer_name
+
"#"
+
str
(
num
)
tensor_id
=
request
.
request_id
+
"#"
+
layer_name
+
"#"
+
str
(
num
)
if
tensor_id
in
self
.
p2p_nccl_engine
.
recv_store
:
if
tensor_id
in
self
.
p2p_nccl_engine
.
recv_store
:
tensor
=
self
.
p2p_nccl_engine
.
recv_store
.
pop
(
tensor_id
,
None
)
tensor
=
self
.
p2p_nccl_engine
.
recv_store
.
pop
(
tensor_id
,
None
)
self
.
p2p_nccl_engine
.
send_request_id_to_tensor_ids
.
pop
(
self
.
p2p_nccl_engine
.
send_request_id_to_tensor_ids
.
pop
(
...
@@ -387,6 +424,24 @@ class P2pNcclConnector(KVConnectorBase_V1):
...
@@ -387,6 +424,24 @@ class P2pNcclConnector(KVConnectorBase_V1):
pp_rank
=
(
self
.
parallel_config
.
rank
//
self
.
parallel_config
.
tensor_parallel_size
pp_rank
=
(
self
.
parallel_config
.
rank
//
self
.
parallel_config
.
tensor_parallel_size
)
%
self
.
parallel_config
.
pipeline_parallel_size
)
%
self
.
parallel_config
.
pipeline_parallel_size
if
(
self
.
multiple_machines
):
ip_second
=
self
.
get_ip_value
(
ip
)
if
(
self
.
pp_size
==
1
):
if
self
.
_rank
<
8
:
self
.
p2p_nccl_engine
.
send_tensor
(
request_id
+
"#"
+
layer_name
,
kv_cache
,
remote_address
)
self
.
p2p_nccl_engine
.
send_tensor
(
request_id
+
"#"
+
layer_name
,
kv_cache
,
str
(
ip_second
)
+
":"
+
str
(
port
+
self
.
_rank
+
8
))
elif
(
self
.
pp_size
==
2
):
if
(
pp_rank
==
0
):
self
.
p2p_nccl_engine
.
send_tensor
(
request_id
+
"#"
+
layer_name
,
kv_cache
,
remote_address
)
else
:
self
.
p2p_nccl_engine
.
send_tensor
(
request_id
+
"#"
+
layer_name
,
kv_cache
,
str
(
ip_second
)
+
":"
+
str
(
port
+
self
.
_rank
))
else
:
print
(
"Error: only suppprt pp1 pp2 !!!!!!"
)
else
:
if
(
self
.
pp_size
==
1
):
if
(
self
.
pp_size
==
1
):
self
.
p2p_nccl_engine
.
send_tensor
(
request_id
+
"#"
+
layer_name
,
self
.
p2p_nccl_engine
.
send_tensor
(
request_id
+
"#"
+
layer_name
,
kv_cache
,
remote_address
)
kv_cache
,
remote_address
)
...
...
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
View file @
7293a072
...
@@ -117,6 +117,7 @@ class P2pNcclEngine:
...
@@ -117,6 +117,7 @@ class P2pNcclEngine:
self
.
p2p_async_kv_tokens
=
envs
.
VLLM_P2P_BUF_TOKENS
self
.
p2p_async_kv_tokens
=
envs
.
VLLM_P2P_BUF_TOKENS
self
.
p2p_async_buf
=
None
self
.
p2p_async_buf
=
None
self
.
tensor_split_num
:
int
=
0
self
.
tensor_split_num
:
int
=
0
self
.
recv_split_nums
:
dict
[
str
,
int
]
=
{}
mem_pool_size_gb
=
self
.
config
.
get_from_extra_config
(
mem_pool_size_gb
=
self
.
config
.
get_from_extra_config
(
"mem_pool_size_gb"
,
DEFAULT_MEM_POOL_SIZE_GB
)
"mem_pool_size_gb"
,
DEFAULT_MEM_POOL_SIZE_GB
)
...
@@ -200,7 +201,6 @@ class P2pNcclEngine:
...
@@ -200,7 +201,6 @@ class P2pNcclEngine:
tensor_id
:
str
,
tensor_id
:
str
,
tensor
:
torch
.
Tensor
,
tensor
:
torch
.
Tensor
,
remote_address
:
typing
.
Optional
[
str
]
=
None
,
remote_address
:
typing
.
Optional
[
str
]
=
None
,
tbo_evt
=
None
,
)
->
bool
:
)
->
bool
:
if
remote_address
is
None
:
if
remote_address
is
None
:
with
self
.
recv_store_cv
:
with
self
.
recv_store_cv
:
...
@@ -251,7 +251,8 @@ class P2pNcclEngine:
...
@@ -251,7 +251,8 @@ class P2pNcclEngine:
if
remote_address
is
None
:
if
remote_address
is
None
:
with
self
.
recv_store_cv
:
with
self
.
recv_store_cv
:
self
.
recv_store
[
tensor_id
]
=
tensor
self
.
recv_store
[
tensor_id
]
=
tensor
self
.
recv_store_cv
.
notify
()
# self.recv_store_cv.notify()
self
.
recv_store_cv
.
notify_all
()
return
True
return
True
else
:
else
:
if
self
.
send_type
==
"PUT"
:
if
self
.
send_type
==
"PUT"
:
...
@@ -260,7 +261,7 @@ class P2pNcclEngine:
...
@@ -260,7 +261,7 @@ class P2pNcclEngine:
with
self
.
send_queue_cv
:
with
self
.
send_queue_cv
:
kv_layer
,
slot_mapping
=
tensor
# tesor (kv_layer, slot_mapping)
kv_layer
,
slot_mapping
=
tensor
# tesor (kv_layer, slot_mapping)
self
.
send_queue
.
append
([
tensor_id
,
remote_address
,
kv_layer
,
slot_mapping
,
tbo_evt
])
self
.
send_queue
.
append
([
tensor_id
,
remote_address
,
kv_layer
,
slot_mapping
,
tbo_evt
])
self
.
send_queue_cv
.
notify
()
self
.
send_queue_cv
.
notify
_all
()
else
:
# GET
else
:
# GET
with
self
.
send_store_cv
:
with
self
.
send_store_cv
:
tensor_size
=
tensor
.
element_size
()
*
tensor
.
numel
()
tensor_size
=
tensor
.
element_size
()
*
tensor
.
numel
()
...
@@ -365,7 +366,14 @@ class P2pNcclEngine:
...
@@ -365,7 +366,14 @@ class P2pNcclEngine:
elif
data
[
"cmd"
]
==
"PUT"
:
elif
data
[
"cmd"
]
==
"PUT"
:
tensor_id
=
data
[
"tensor_id"
]
tensor_id
=
data
[
"tensor_id"
]
if
"tensor_split_num"
in
data
:
if
"tensor_split_num"
in
data
:
self
.
tensor_split_num
=
data
[
"tensor_split_num"
]
# self.tensor_split_num = data["tensor_split_num"]
parts
=
tensor_id
.
split
(
'#'
)
request_id
=
parts
[
0
]
layer_name
=
parts
[
1
]
req_layer
=
f
"
{
request_id
}
#
{
layer_name
}
"
self
.
recv_split_nums
[
req_layer
]
=
data
[
"tensor_split_num"
]
with
self
.
recv_store_cv
:
self
.
recv_store_cv
.
notify_all
()
try
:
try
:
with
torch
.
cuda
.
stream
(
self
.
recv_stream
):
with
torch
.
cuda
.
stream
(
self
.
recv_stream
):
tensor
=
torch
.
empty
(
data
[
"shape"
],
tensor
=
torch
.
empty
(
data
[
"shape"
],
...
@@ -397,7 +405,8 @@ class P2pNcclEngine:
...
@@ -397,7 +405,8 @@ class P2pNcclEngine:
with
self
.
recv_store_cv
:
with
self
.
recv_store_cv
:
self
.
recv_store
[
tensor_id
]
=
tensor
self
.
recv_store
[
tensor_id
]
=
tensor
self
.
_have_received_tensor_id
(
tensor_id
)
self
.
_have_received_tensor_id
(
tensor_id
)
self
.
recv_store_cv
.
notify
()
#self.recv_store_cv.notify()
self
.
recv_store_cv
.
notify_all
()
elif
data
[
"cmd"
]
==
"GET"
:
elif
data
[
"cmd"
]
==
"GET"
:
tensor_id
=
data
[
"tensor_id"
]
tensor_id
=
data
[
"tensor_id"
]
...
@@ -450,7 +459,7 @@ class P2pNcclEngine:
...
@@ -450,7 +459,7 @@ class P2pNcclEngine:
else
:
else
:
tensor_id
,
remote_address
,
tensor
=
self
.
send_queue
.
popleft
()
tensor_id
,
remote_address
,
tensor
=
self
.
send_queue
.
popleft
()
if
not
self
.
send_queue
:
if
not
self
.
send_queue
:
self
.
send_queue_cv
.
notify
()
self
.
send_queue_cv
.
notify
_all
()
if
(
envs
.
VLLM_ENABLE_TBO
or
envs
.
VLLM_P2P_ASYNC
)
and
tbo_evt
is
not
None
:
if
(
envs
.
VLLM_ENABLE_TBO
or
envs
.
VLLM_P2P_ASYNC
)
and
tbo_evt
is
not
None
:
self
.
send_stream
.
wait_event
(
tbo_evt
)
self
.
send_stream
.
wait_event
(
tbo_evt
)
self
.
_send_kv_p2p_sync
(
tensor_id
,
kv_layer
,
slot_mapping
,
remote_address
)
self
.
_send_kv_p2p_sync
(
tensor_id
,
kv_layer
,
slot_mapping
,
remote_address
)
...
@@ -590,20 +599,30 @@ class P2pNcclEngine:
...
@@ -590,20 +599,30 @@ class P2pNcclEngine:
"""
"""
# Clear the buffer upon request completion.
# Clear the buffer upon request completion.
# for request_id in finished_req_ids:
# for layer_name in forward_context.no_compile_layers:
# tensor_id = request_id + "#" + layer_name
# if tensor_id in self.recv_store:
# with self.recv_store_cv:
# tensor = self.recv_store.pop(tensor_id, None)
# self.send_request_id_to_tensor_ids.pop(
# request_id, None)
# self.recv_request_id_to_tensor_ids.pop(
# request_id, None)
# addr = 0
# if isinstance(tensor, tuple):
# addr, _, _ = tensor
# self.pool.free(addr)
for
request_id
in
finished_req_ids
:
for
request_id
in
finished_req_ids
:
for
layer_name
in
forward_context
.
no_compile_layers
:
ids
=
self
.
recv_request_id_to_tensor_ids
.
pop
(
request_id
,
set
())
tensor_id
=
request_id
+
"#"
+
layer_name
if
tensor_id
in
self
.
recv_store
:
with
self
.
recv_store_cv
:
with
self
.
recv_store_cv
:
for
tensor_id
in
ids
:
tensor
=
self
.
recv_store
.
pop
(
tensor_id
,
None
)
tensor
=
self
.
recv_store
.
pop
(
tensor_id
,
None
)
self
.
send_request_id_to_tensor_ids
.
pop
(
request_id
,
None
)
self
.
recv_request_id_to_tensor_ids
.
pop
(
request_id
,
None
)
addr
=
0
if
isinstance
(
tensor
,
tuple
):
if
isinstance
(
tensor
,
tuple
):
addr
,
_
,
_
=
tensor
addr
,
_
,
_
=
tensor
self
.
pool
.
free
(
addr
)
self
.
pool
.
free
(
addr
)
self
.
send_request_id_to_tensor_ids
.
pop
(
request_id
,
None
)
# TODO:Retrieve requests that have already sent the KV cache.
# TODO:Retrieve requests that have already sent the KV cache.
finished_sending
:
set
[
str
]
=
set
()
finished_sending
:
set
[
str
]
=
set
()
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
7293a072
...
@@ -226,21 +226,7 @@ class DeepseekV2MoE(nn.Module):
...
@@ -226,21 +226,7 @@ class DeepseekV2MoE(nn.Module):
router_logits
,
_
=
self
.
gate
(
hidden_states
)
router_logits
,
_
=
self
.
gate
(
hidden_states
)
if
not
self
.
use_mori_ep
:
if
not
self
.
use_mori_ep
:
if
envs
.
envs
.
VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
:
if
envs
.
VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
:
if
self
.
enable_expert_parallel
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
if
shared_output
is
not
None
:
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
final_hidden_states
+
shared_output
else
:
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
else
:
final_hidden_states
=
self
.
experts
(
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
hidden_states
=
hidden_states
,
router_logits
=
router_logits
,
router_logits
=
router_logits
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment