Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8622f218
Commit
8622f218
authored
Feb 28, 2026
by
xuxz
Browse files
Remove redundant code
parent
fa3bae2e
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
41 additions
and
201 deletions
+41
-201
vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_connector.py
...uted/kv_transfer/kv_connector/v1/du/du_swift_connector.py
+41
-201
No files found.
vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_connector.py
View file @
8622f218
...
@@ -303,7 +303,6 @@ class DuSwiftConnector(KVConnectorBase_V1):
...
@@ -303,7 +303,6 @@ class DuSwiftConnector(KVConnectorBase_V1):
kv_cache_layer
=
kv_cache
[
\
kv_cache_layer
=
kv_cache
[
\
forward_context
.
virtual_engine
]
forward_context
.
virtual_engine
]
if
not
envs
.
VLLM_P2P_ASYNC
:
kv_cache
=
self
.
du_swift_engine
.
recv_tensor
(
kv_cache
=
self
.
du_swift_engine
.
recv_tensor
(
request
.
request_id
+
"#"
+
layer_name
)
request
.
request_id
+
"#"
+
layer_name
)
...
@@ -324,61 +323,6 @@ class DuSwiftConnector(KVConnectorBase_V1):
...
@@ -324,61 +323,6 @@ class DuSwiftConnector(KVConnectorBase_V1):
if
isinstance
(
tensor
,
tuple
):
if
isinstance
(
tensor
,
tuple
):
addr
,
_
,
_
=
tensor
addr
,
_
,
_
=
tensor
self
.
du_swift_engine
.
pool
.
free
(
addr
)
self
.
du_swift_engine
.
pool
.
free
(
addr
)
else
:
dst_kv_cache_layer_shape
=
kv_cache_layer
.
shape
if
isinstance
(
attn_metadata
,
MLACommonMetadata
)
or
all
(
isinstance
(
value
,
MLACommonMetadata
)
for
value
in
attn_metadata
.
values
()):
num_pages
=
dst_kv_cache_layer_shape
[
0
]
page_size
=
dst_kv_cache_layer_shape
[
1
]
assert
kv_cache_layer
.
is_contiguous
()
dst_kv_cache_layer
=
kv_cache_layer
.
reshape
(
num_pages
*
page_size
,
-
1
)
else
:
num_pages
=
dst_kv_cache_layer_shape
[
1
]
page_size
=
dst_kv_cache_layer_shape
[
2
]
assert
kv_cache_layer
.
is_contiguous
()
dst_kv_cache_layer
=
kv_cache_layer
.
reshape
(
2
,
num_pages
*
page_size
,
-
1
)
inject_start_index
=
0
for
num
in
range
(
self
.
du_swift_engine
.
tensor_split_num
):
kv_cache
=
self
.
du_swift_engine
.
recv_tensor
(
request
.
request_id
+
"#"
+
layer_name
+
"#"
+
str
(
num
))
if
kv_cache
is
None
:
logger
.
warning
(
"🚧src_kv_cache is None, %s"
,
request
.
request_id
)
continue
if
isinstance
(
attn_metadata
,
MLACommonMetadata
)
or
all
(
isinstance
(
value
,
MLACommonMetadata
)
for
value
in
attn_metadata
.
values
()):
num_token
=
kv_cache
.
shape
[
0
]
if
len
(
request
.
slot_mapping
)
==
num_token
:
dst_kv_cache_layer
[
request
.
slot_mapping
,
...]
=
kv_cache
else
:
dst_kv_cache_layer
[
request
.
slot_mapping
[
inject_start_index
:
inject_start_index
+
num_token
],
...]
=
kv_cache
else
:
num_token
=
kv_cache
.
shape
[
1
]
if
len
(
request
.
slot_mapping
)
==
num_token
:
dst_kv_cache_layer
[:,
request
.
slot_mapping
,
...]
=
kv_cache
else
:
dst_kv_cache_layer
[:,
request
.
slot_mapping
[
inject_start_index
:
inject_start_index
+
num_token
],
...]
=
kv_cache
inject_start_index
+=
num_token
# inject_kv_into_layer(kv_cache_layer, kv_cache,
# request.slot_mapping, request.request_id)
tensor_id
=
request
.
request_id
+
"#"
+
layer_name
+
"#"
+
str
(
num
)
if
tensor_id
in
self
.
du_swift_engine
.
recv_store
:
tensor
=
self
.
du_swift_engine
.
recv_store
.
pop
(
tensor_id
,
None
)
self
.
du_swift_engine
.
send_request_id_to_tensor_ids
.
pop
(
request
.
request_id
,
None
)
self
.
du_swift_engine
.
recv_request_id_to_tensor_ids
.
pop
(
request
.
request_id
,
None
)
addr
=
0
if
isinstance
(
tensor
,
tuple
):
addr
,
_
,
_
=
tensor
self
.
du_swift_engine
.
pool
.
free
(
addr
)
dst_kv_cache_layer
.
reshape
(
dst_kv_cache_layer_shape
)
def
wait_for_layer_load
(
self
,
layer_name
:
str
)
->
None
:
def
wait_for_layer_load
(
self
,
layer_name
:
str
)
->
None
:
...
@@ -433,72 +377,10 @@ class DuSwiftConnector(KVConnectorBase_V1):
...
@@ -433,72 +377,10 @@ class DuSwiftConnector(KVConnectorBase_V1):
connector_metadata
=
self
.
_get_connector_metadata
()
connector_metadata
=
self
.
_get_connector_metadata
()
assert
isinstance
(
connector_metadata
,
DuSwiftConnectorMetadata
)
assert
isinstance
(
connector_metadata
,
DuSwiftConnectorMetadata
)
if
envs
.
VLLM_ENABLE_TBO
or
envs
.
VLLM_P2P_ASYNC
:
for
request
in
connector_metadata
.
requests
:
request_id
=
request
.
request_id
ip
,
port
=
self
.
parse_request_id
(
request_id
,
True
)
remote_address
=
ip
+
":"
+
str
(
port
+
self
.
_rank
)
slot_mapping
=
request
.
slot_mapping
if
request
.
slot_mapping_device
is
None
:
request
.
slot_mapping_device
=
\
request
.
slot_mapping
.
pin_memory
().
to
(
device
=
kv_layer
.
device
,
non_blocking
=
True
)
slot_mapping
=
request
.
slot_mapping_device
tbo_evt
=
torch
.
cuda
.
Event
(
enable_timing
=
False
)
tbo_evt
.
record
()
pp_rank
=
(
self
.
parallel_config
.
rank
//
self
.
parallel_config
.
tensor_parallel_size
)
%
\
self
.
parallel_config
.
pipeline_parallel_size
if
(
self
.
pp_size
==
1
):
self
.
du_swift_engine
.
p2p_async_send_tensor
(
request_id
+
"#"
+
layer_name
,
(
kv_layer
,
slot_mapping
),
remote_address
,
tbo_evt
)
elif
(
self
.
pp_size
==
2
):
if
(
pp_rank
==
0
):
self
.
du_swift_engine
.
p2p_async_send_tensor
(
request_id
+
"#"
+
layer_name
,
(
kv_layer
,
slot_mapping
),
remote_address
,
tbo_evt
)
self
.
du_swift_engine
.
p2p_async_send_tensor
(
request_id
+
"#"
+
layer_name
,
(
kv_layer
,
slot_mapping
),
ip
+
":"
+
str
(
port
+
self
.
_rank
+
4
),
tbo_evt
)
else
:
self
.
du_swift_engine
.
p2p_async_send_tensor
(
request_id
+
"#"
+
layer_name
,
(
kv_layer
,
slot_mapping
),
remote_address
,
tbo_evt
)
self
.
du_swift_engine
.
p2p_async_send_tensor
(
request_id
+
"#"
+
layer_name
,
(
kv_layer
,
slot_mapping
),
ip
+
":"
+
str
(
port
+
self
.
_rank
-
4
),
tbo_evt
)
elif
(
self
.
pp_size
==
8
):
for
i
in
range
(
8
):
self
.
du_swift_engine
.
p2p_async_send_tensor
(
request_id
+
"#"
+
layer_name
,
(
kv_layer
,
slot_mapping
),
ip
+
":"
+
str
(
port
+
i
),
tbo_evt
)
else
:
print
(
"Error: only suppprt pp1 pp2 pp8!!!!!!"
)
else
:
for
request
in
connector_metadata
.
requests
:
for
request
in
connector_metadata
.
requests
:
request_id
=
request
.
request_id
request_id
=
request
.
request_id
# ip, port = self.parse_request_id(request_id, True)
# p_ip, p_port = self.parse_request_id(request_id, False)
# remote_address = ip + ":" + str(port + self._rank)
# pd_pair_id = p_ip + ":" + p_port + "_" + ip + ":" + port
kv_cache
=
extract_kv_from_layer
(
kv_layer
,
request
.
slot_mapping
)
kv_cache
=
extract_kv_from_layer
(
kv_layer
,
request
.
slot_mapping
)
# pp_rank = (self.parallel_config.rank // self.parallel_config.tensor_parallel_size
# ) % self.parallel_config.pipeline_parallel_size
# if (self.multiple_machines_p and self.multiple_machines_d):
# ip_second = self.get_ip_value(ip)
# if (self.pp_size == 1):
# if self._rank < 8:
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, remote_address)
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, str(ip_second) + ":" + str(port + self._rank + 8))
# elif (self.pp_size == 2):
# if (pp_rank == 0):
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, remote_address)
# else:
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, str(ip_second) + ":" + str(port + self._rank))
# else:
# logger.error("Error: multiple machines only suppprt pp1tp16 and pp2tp8!!!!!!")
# elif (self.multiple_machines_p and not self.multiple_machines_d):
# if (self.pp_size == 2):
# remote_address = ip + ":" + str(port + self._tp_rank)
pending
=
False
pending
=
False
with
self
.
du_swift_engine
.
req_status_cv
:
with
self
.
du_swift_engine
.
req_status_cv
:
if
request_id
not
in
self
.
du_swift_engine
.
req_status
:
if
request_id
not
in
self
.
du_swift_engine
.
req_status
:
...
@@ -514,44 +396,10 @@ class DuSwiftConnector(KVConnectorBase_V1):
...
@@ -514,44 +396,10 @@ class DuSwiftConnector(KVConnectorBase_V1):
remote_addr
=
RemoteAddr
(
req_data
.
pd_pair_id
,
*
(
req_data
.
zmq_address_and_comm_rank
[
i
]))
remote_addr
=
RemoteAddr
(
req_data
.
pd_pair_id
,
*
(
req_data
.
zmq_address_and_comm_rank
[
i
]))
self
.
du_swift_engine
.
send_tensor
(
request_id
+
"#"
+
layer_name
,
self
.
du_swift_engine
.
send_tensor
(
request_id
+
"#"
+
layer_name
,
kv_cache
,
remote_addr
)
kv_cache
,
remote_addr
)
# kv_cache, remote_address)
# else:
# logger.error("Error: P multiple machines D machine only suppprt P:pp2tp8 D:tp8 !!!!!!")
# elif (not self.multiple_machines_p and not self.multiple_machines_d):
# # remote_addr = RemoteAddr(pd_pair_id, remote_address, self._rank + self.num_card)
# self.du_swift_engine.send_tensor_new(request_id, layer_name, kv_cache,
# is_mla)
# if (self.pp_size == 1):
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, remote_address)
# elif (self.pp_size == 2):
# if (pp_rank == 0):
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, remote_address)
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, ip + ":" + str(port + self._rank + 4))
# else:
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, remote_address)
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, ip + ":" + str(port + self._rank - 4))
# elif (self.pp_size == 8):
# for i in range(8):
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, ip + ":" + str(port + i))
# elif (self.enable_asymmetric_p2p):
# self.du_swift_engine.send_tensor(request_id + "#" + layer_name,
# kv_cache, remote_address)
# else:
# logger.error("Error: P/D single machine only suppprt multiple tp:: (P: pp2tp4 D:tp8 P:pp8tp1 D:tp8) !!!!!!")
# else:
# logger.error("Error: not support!!!!!!")
def
wait_for_save
(
self
):
def
wait_for_save
(
self
):
pass
pass
# if self.is_producer:
# assert self.du_swift_engine is not None
# self.du_swift_engine.wait_for_sent()
def
get_finished
(
def
get_finished
(
self
,
finished_req_ids
:
set
[
str
],
self
,
finished_req_ids
:
set
[
str
],
...
@@ -703,14 +551,6 @@ class DuSwiftConnector(KVConnectorBase_V1):
...
@@ -703,14 +551,6 @@ class DuSwiftConnector(KVConnectorBase_V1):
block_ids
=
block_ids
,
block_ids
=
block_ids
,
block_size
=
self
.
_block_size
)
block_size
=
self
.
_block_size
)
# Requests loaded asynchronously are not in the scheduler_output.
# for request_id in self._requests_need_load:
# request, block_ids = self._requests_need_load[request_id]
# meta.add_request(request_id=request.request_id,
# token_ids=request.prompt_token_ids,
# block_ids=block_ids,
# block_size=self._block_size)
self
.
_requests_need_load
.
clear
()
self
.
_requests_need_load
.
clear
()
return
meta
return
meta
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment