Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
56fef1c3
Commit
56fef1c3
authored
Mar 09, 2026
by
xuxz
Browse files
[PD]适配v0.15.1
parent
cca00f5c
Changes
6
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
1987 additions
and
1 deletion
+1987
-1
vllm/distributed/kv_transfer/kv_connector/factory.py
vllm/distributed/kv_transfer/kv_connector/factory.py
+5
-0
vllm/distributed/kv_transfer/kv_connector/v1/du/__init__.py
vllm/distributed/kv_transfer/kv_connector/v1/du/__init__.py
+0
-0
vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_connector.py
...uted/kv_transfer/kv_connector/v1/du/du_swift_connector.py
+724
-0
vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_engine.py
...ributed/kv_transfer/kv_connector/v1/du/du_swift_engine.py
+985
-0
vllm/distributed/kv_transfer/kv_connector/v1/du/tensor_memory_pool.py
...uted/kv_transfer/kv_connector/v1/du/tensor_memory_pool.py
+265
-0
vllm/envs.py
vllm/envs.py
+8
-1
No files found.
vllm/distributed/kv_transfer/kv_connector/factory.py
View file @
56fef1c3
...
@@ -155,6 +155,11 @@ KVConnectorFactory.register_connector(
...
@@ -155,6 +155,11 @@ KVConnectorFactory.register_connector(
"P2pNcclConnector"
,
"P2pNcclConnector"
,
)
)
KVConnectorFactory
.
register_connector
(
"DuSwiftConnector"
,
"vllm.distributed.kv_transfer.kv_connector.v1.du.du_swift_connector"
,
"DuSwiftConnector"
)
KVConnectorFactory
.
register_connector
(
KVConnectorFactory
.
register_connector
(
"LMCacheConnectorV1"
,
"LMCacheConnectorV1"
,
"vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector"
,
"vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector"
,
...
...
vllm/distributed/kv_transfer/kv_connector/v1/du/__init__.py
0 → 100644
View file @
56fef1c3
vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_connector.py
0 → 100644
View file @
56fef1c3
This diff is collapsed.
Click to expand it.
vllm/distributed/kv_transfer/kv_connector/v1/du/du_swift_engine.py
0 → 100644
View file @
56fef1c3
This diff is collapsed.
Click to expand it.
vllm/distributed/kv_transfer/kv_connector/v1/du/tensor_memory_pool.py
0 → 100644
View file @
56fef1c3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
atexit
import
ctypes
import
math
from
dataclasses
import
dataclass
import
torch
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
@
dataclass
class
MemoryBlock
:
size
:
int
addr
:
int
"""A memory pool for managing pinned host memory allocations for tensors.
This class implements a buddy allocation system to efficiently manage pinned
host memory for tensor storage. It supports allocation, deallocation, and
tensor storage/retrieval operations.
Key Features:
- Uses power-of-two block sizes for efficient buddy allocation
- Supports splitting and merging of memory blocks
- Provides methods to store CUDA tensors in pinned host memory
- Allows loading tensors from pinned memory back to device
- Automatically cleans up memory on destruction
Attributes:
max_block_size (int): Maximum block size (rounded to nearest power of two)
min_block_size (int): Minimum block size (rounded to nearest power of two)
free_lists (dict): Dictionary of free memory blocks by size
allocated_blocks (dict): Dictionary of currently allocated blocks
base_tensor (torch.Tensor): Base pinned memory tensor
base_address (int): Base memory address of the pinned memory region
Example:
>>> pool = TensorMemoryPool(max_block_size=1024*1024)
>>> tensor = torch.randn(100, device='cuda')
>>> addr = pool.store_tensor(tensor)
>>> loaded_tensor = pool.load_tensor(addr, tensor.dtype,
... tensor.shape, 'cuda')
>>> pool.free(addr)
"""
class
TensorMemoryPool
:
"""Initializes the memory pool with given size constraints.
Args:
max_block_size (int): Maximum size of memory blocks to manage
min_block_size (int, optional): Minimum size of memory blocks
to manage. Defaults to 512.
Raises:
ValueError: If block sizes are invalid or max_block_size is less
than min_block_size
"""
def
__init__
(
self
,
max_block_size
:
int
,
min_block_size
:
int
=
128
):
if
max_block_size
<=
0
or
min_block_size
<=
0
:
raise
ValueError
(
"Block sizes must be positive"
)
if
max_block_size
<
min_block_size
:
raise
ValueError
(
"Max block size must be greater than min block size"
)
self
.
max_block_size
=
self
.
_round_to_power_of_two
(
max_block_size
)
self
.
min_block_size
=
self
.
_round_to_power_of_two
(
min_block_size
)
self
.
free_lists
:
dict
[
int
,
dict
[
int
,
MemoryBlock
]]
=
{}
self
.
allocated_blocks
:
dict
[
int
,
MemoryBlock
]
=
{}
self
.
_initialize_free_lists
()
self
.
_allocate_pinned_memory
()
atexit
.
register
(
self
.
cleanup
)
def
_round_to_power_of_two
(
self
,
size
:
int
)
->
int
:
return
1
<<
(
size
-
1
).
bit_length
()
def
_initialize_free_lists
(
self
):
size
=
self
.
max_block_size
while
size
>=
self
.
min_block_size
:
self
.
free_lists
[
size
]
=
{}
size
//=
2
def
_allocate_pinned_memory
(
self
):
self
.
base_tensor
=
torch
.
empty
(
self
.
max_block_size
//
4
,
dtype
=
torch
.
float32
,
pin_memory
=
True
)
self
.
base_address
=
self
.
base_tensor
.
data_ptr
()
initial_block
=
MemoryBlock
(
size
=
self
.
max_block_size
,
addr
=
self
.
base_address
)
self
.
free_lists
[
self
.
max_block_size
][
initial_block
.
addr
]
=
initial_block
logger
.
debug
(
"TensorMemoryPool, base_address:"
,
self
.
base_address
,
self
.
base_address
%
self
.
max_block_size
)
def
allocate
(
self
,
size
:
int
)
->
int
:
"""Allocates a memory block of at least the requested size.
Args:
size (int): Minimum size of memory to allocate
Returns:
int: Address of the allocated memory block
Raises:
ValueError: If size is invalid or insufficient memory is available
"""
if
size
<=
0
:
raise
ValueError
(
"Allocation size must be positive"
)
required_size
=
self
.
_round_to_power_of_two
(
max
(
size
,
self
.
min_block_size
))
if
required_size
>
self
.
max_block_size
:
raise
ValueError
(
"Requested size exceeds maximum block size"
)
current_size
=
required_size
while
current_size
<=
self
.
max_block_size
:
if
self
.
free_lists
[
current_size
]:
_
,
block
=
self
.
free_lists
[
current_size
].
popitem
()
self
.
_split_block
(
block
,
required_size
)
self
.
allocated_blocks
[
block
.
addr
]
=
block
return
block
.
addr
current_size
*=
2
raise
ValueError
(
"Insufficient memory"
)
def
_split_block
(
self
,
block
:
MemoryBlock
,
required_size
:
int
):
while
(
block
.
size
>
required_size
and
block
.
size
//
2
>=
self
.
min_block_size
):
buddy_size
=
block
.
size
//
2
buddy_addr
=
block
.
addr
+
buddy_size
buddy
=
MemoryBlock
(
size
=
buddy_size
,
addr
=
buddy_addr
)
block
.
size
=
buddy_size
self
.
free_lists
[
buddy_size
][
buddy
.
addr
]
=
buddy
def
free
(
self
,
addr
:
int
):
"""Frees an allocated memory block.
Args:
addr (int): Address of the block to free
Raises:
ValueError: If address is invalid or not allocated
"""
if
addr
not
in
self
.
allocated_blocks
:
raise
ValueError
(
"Invalid address to free"
)
block
=
self
.
allocated_blocks
.
pop
(
addr
)
self
.
_merge_buddies
(
block
)
def
_merge_buddies
(
self
,
block
:
MemoryBlock
):
MAX_MERGE_DEPTH
=
30
depth
=
0
while
depth
<
MAX_MERGE_DEPTH
:
buddy_offset
=
block
.
size
if
(
block
.
addr
-
self
.
base_address
)
%
(
2
*
block
.
size
)
==
0
else
-
block
.
size
buddy_addr
=
block
.
addr
+
buddy_offset
buddy
=
self
.
free_lists
[
block
.
size
].
get
(
buddy_addr
)
if
buddy
:
del
self
.
free_lists
[
buddy
.
size
][
buddy
.
addr
]
merged_addr
=
min
(
block
.
addr
,
buddy
.
addr
)
merged_size
=
block
.
size
*
2
block
=
MemoryBlock
(
size
=
merged_size
,
addr
=
merged_addr
)
depth
+=
1
else
:
break
self
.
free_lists
[
block
.
size
][
block
.
addr
]
=
block
def
store_tensor
(
self
,
tensor
:
torch
.
Tensor
)
->
int
:
"""Stores a CUDA tensor in pinned host memory.
Args:
tensor (torch.Tensor): CUDA tensor to store
Returns:
int: Address where the tensor is stored
Raises:
ValueError: If tensor is not on CUDA or allocation fails
"""
if
not
tensor
.
is_cuda
:
raise
ValueError
(
"Only CUDA tensors can be stored"
)
size
=
tensor
.
element_size
()
*
tensor
.
numel
()
addr
=
self
.
allocate
(
size
)
block
=
self
.
allocated_blocks
[
addr
]
if
block
.
size
<
size
:
self
.
free
(
addr
)
raise
ValueError
(
f
"Allocated block size
{
block
.
size
}
is smaller than "
f
"required size
{
size
}
"
)
try
:
buffer
=
(
ctypes
.
c_byte
*
block
.
size
).
from_address
(
block
.
addr
)
cpu_tensor
=
torch
.
frombuffer
(
buffer
,
dtype
=
tensor
.
dtype
,
count
=
tensor
.
numel
()).
reshape
(
tensor
.
shape
)
except
ValueError
as
err
:
self
.
free
(
addr
)
raise
ValueError
(
f
"Failed to create tensor view:
{
err
}
"
)
from
err
cpu_tensor
.
copy_
(
tensor
)
return
addr
def
load_tensor
(
self
,
addr
:
int
,
dtype
:
torch
.
dtype
,
shape
:
tuple
[
int
,
...],
device
)
->
torch
.
Tensor
:
"""Loads a tensor from pinned host memory to the specified device.
Args:
addr (int): Address where tensor is stored
dtype (torch.dtype): Data type of the tensor
shape (tuple[int, ...]): Shape of the tensor
device: Target device for the loaded tensor
Returns:
torch.Tensor: The loaded tensor on the specified device
Raises:
ValueError: If address is invalid or sizes don't match
"""
if
addr
not
in
self
.
allocated_blocks
:
raise
ValueError
(
"Invalid address to load"
)
block
=
self
.
allocated_blocks
[
addr
]
num_elements
=
math
.
prod
(
shape
)
dtype_size
=
torch
.
tensor
([],
dtype
=
dtype
).
element_size
()
required_size
=
num_elements
*
dtype_size
if
required_size
>
block
.
size
:
raise
ValueError
(
"Requested tensor size exceeds block size"
)
buffer
=
(
ctypes
.
c_byte
*
block
.
size
).
from_address
(
block
.
addr
)
cpu_tensor
=
torch
.
frombuffer
(
buffer
,
dtype
=
dtype
,
count
=
num_elements
).
reshape
(
shape
)
cuda_tensor
=
torch
.
empty
(
shape
,
dtype
=
dtype
,
device
=
device
)
cuda_tensor
.
copy_
(
cpu_tensor
)
return
cuda_tensor
def
cleanup
(
self
):
"""Cleans up all memory resources and resets the pool state."""
self
.
free_lists
.
clear
()
self
.
allocated_blocks
.
clear
()
if
hasattr
(
self
,
'base_tensor'
):
del
self
.
base_tensor
def
__del__
(
self
):
self
.
cleanup
()
vllm/envs.py
View file @
56fef1c3
...
@@ -281,6 +281,8 @@ if TYPE_CHECKING:
...
@@ -281,6 +281,8 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_MOE_ALIGN
:
bool
=
False
VLLM_USE_LIGHTOP_MOE_ALIGN
:
bool
=
False
VLLM_USE_MERGE_ATTN_STATES_OPT
:
bool
=
False
VLLM_USE_MERGE_ATTN_STATES_OPT
:
bool
=
False
USE_FUSED_RMS_QUANT
:
bool
=
False
USE_FUSED_RMS_QUANT
:
bool
=
False
VLLM_P2P_ASYNC
:
bool
=
False
VLLM_P2P_BUF_TOKENS
:
int
=
30000
USE_FUSED_SILU_MUL_QUANT
:
bool
=
False
USE_FUSED_SILU_MUL_QUANT
:
bool
=
False
VLLM_USE_PD_SPLIT
:
bool
=
False
VLLM_USE_PD_SPLIT
:
bool
=
False
VLLM_USE_PP_SYNC
:
bool
=
False
VLLM_USE_PP_SYNC
:
bool
=
False
...
@@ -1805,7 +1807,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1805,7 +1807,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm will use rmsquant fused op
# vllm will use rmsquant fused op
"USE_FUSED_RMS_QUANT"
:
"USE_FUSED_RMS_QUANT"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_RMS_QUANT"
,
"0"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"USE_FUSED_RMS_QUANT"
,
"0"
))),
# vllm pd separation will be used async
"VLLM_P2P_ASYNC"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_P2P_ASYNC"
,
"0"
))),
# pd separation p2p async buf tokens
"VLLM_P2P_BUF_TOKENS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_P2P_BUF_TOKENS"
,
"30000"
)),
# vllm will use silu_mul_quant fused op
# vllm will use silu_mul_quant fused op
"USE_FUSED_SILU_MUL_QUANT"
:
"USE_FUSED_SILU_MUL_QUANT"
:
lambda
:
(
os
.
getenv
(
"USE_FUSED_SILU_MUL_QUANT"
,
"False"
).
lower
()
in
lambda
:
(
os
.
getenv
(
"USE_FUSED_SILU_MUL_QUANT"
,
"False"
).
lower
()
in
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment