Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
167 additions
and
3 deletions
+167
-3
vllm/core/block/common.py
vllm/core/block/common.py
+1
-0
vllm/core/block/cpu_gpu_block_allocator.py
vllm/core/block/cpu_gpu_block_allocator.py
+1
-0
vllm/core/block/interfaces.py
vllm/core/block/interfaces.py
+1
-0
vllm/core/block/naive_block.py
vllm/core/block/naive_block.py
+1
-0
vllm/core/block/prefix_caching_block.py
vllm/core/block/prefix_caching_block.py
+1
-0
vllm/core/block/utils.py
vllm/core/block/utils.py
+1
-0
vllm/core/block_manager.py
vllm/core/block_manager.py
+1
-0
vllm/core/evictor.py
vllm/core/evictor.py
+1
-0
vllm/core/interfaces.py
vllm/core/interfaces.py
+1
-0
vllm/core/placeholder_block_space_manager.py
vllm/core/placeholder_block_space_manager.py
+1
-0
vllm/core/scheduler.py
vllm/core/scheduler.py
+1
-0
vllm/device_allocator/cumem.py
vllm/device_allocator/cumem.py
+1
-0
vllm/distributed/__init__.py
vllm/distributed/__init__.py
+1
-0
vllm/distributed/communication_op.py
vllm/distributed/communication_op.py
+1
-0
vllm/distributed/device_communicators/all2all.py
vllm/distributed/device_communicators/all2all.py
+139
-1
vllm/distributed/device_communicators/base_device_communicator.py
...tributed/device_communicators/base_device_communicator.py
+2
-2
vllm/distributed/device_communicators/cpu_communicator.py
vllm/distributed/device_communicators/cpu_communicator.py
+1
-0
vllm/distributed/device_communicators/cuda_communicator.py
vllm/distributed/device_communicators/cuda_communicator.py
+9
-0
vllm/distributed/device_communicators/cuda_wrapper.py
vllm/distributed/device_communicators/cuda_wrapper.py
+1
-0
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/custom_all_reduce.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
vllm/core/block/common.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections
import
deque
from
dataclasses
import
dataclass
...
...
vllm/core/block/cpu_gpu_block_allocator.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Dict
,
FrozenSet
,
List
,
Optional
,
Tuple
...
...
vllm/core/block/interfaces.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
,
abstractmethod
from
typing
import
Dict
,
FrozenSet
,
List
,
Optional
,
Protocol
,
Tuple
...
...
vllm/core/block/naive_block.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections
import
deque
from
typing
import
Deque
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
...
...
vllm/core/block/prefix_caching_block.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Token blocks."""
import
sys
from
bisect
import
bisect_left
...
...
vllm/core/block/utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Block manager utils."""
from
vllm.sequence
import
SequenceGroup
from
vllm.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
...
...
vllm/core/block_manager.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""A block manager that manages token blocks."""
from
typing
import
Dict
,
List
,
Optional
from
typing
import
Sequence
as
GenericSequence
...
...
vllm/core/evictor.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
enum
import
heapq
...
...
vllm/core/interfaces.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
enum
from
abc
import
ABC
,
abstractmethod
...
...
vllm/core/placeholder_block_space_manager.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
List
,
Optional
,
Tuple
...
...
vllm/core/scheduler.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
enum
import
os
...
...
vllm/device_allocator/cumem.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# cumem-based pytorch pluggable allocator to implement sleep mode.
# other approaches tried but failed:
...
...
vllm/distributed/__init__.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
.communication_op
import
*
from
.parallel_state
import
*
...
...
vllm/distributed/communication_op.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
,
Optional
,
Union
...
...
vllm/distributed/device_communicators/all2all.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
importlib.util
from
typing
import
TYPE_CHECKING
from
typing
import
TYPE_CHECKING
,
Any
import
torch
import
torch.distributed
as
dist
...
...
@@ -124,3 +125,140 @@ class PPLXAll2AllManager(All2AllManagerBase):
from
pplx_kernels.nvshmem
import
nvshmem_finalize
logger
.
debug
(
"PPLX NVSHMEM finalize"
)
nvshmem_finalize
()
class DeepEPAll2AllManagerBase(All2AllManagerBase):
    """
    Shared scaffolding for the DeepEP-backed all2all managers.

    Verifies that the ``deep_ep`` package can be found, creates the handle
    cache and records the SM count. ``get_handle``, ``dispatch`` and
    ``combine`` raise ``NotImplementedError`` at this level.
    """

    def __init__(self, cpu_group):
        # Look the package up without importing it, so a missing install
        # produces the installation hint below.
        deep_ep_spec = importlib.util.find_spec("deep_ep")
        assert deep_ep_spec is not None, (
            "DeepEP kernels not found. Please follow "
            "https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"  # noqa: E501
            " to install DeepEP kernels.")

        super().__init__(cpu_group)

        # This is the DeepEP default. Stick to it till we can establish
        # reasonable defaults based on profiling.
        self.num_sms = 20
        self.handle_cache = Cache()

    def get_handle(self, kwargs):
        """Return a communication handle; not implemented at this level."""
        raise NotImplementedError

    def dispatch(self, hidden_states: torch.Tensor,
                 router_logits: torch.Tensor):
        """Not implemented at this level."""
        raise NotImplementedError

    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Not implemented at this level."""
        raise NotImplementedError

    def destroy(self):
        """Do nothing."""
class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
    """
    All2All communication based on DeepEP High-Throughput kernels.
    """

    def __init__(self, cpu_group):
        super().__init__(cpu_group)

    def _make_all2all_kwargs(self) -> dict[Any, Any]:
        """
        Build the keyword arguments used to construct the DeepEP buffer.

        The RDMA buffer size and the number of QPs per rank depend on
        whether ``self.internode`` is set; ``low_latency_mode`` is always
        False for this manager.
        """
        # Defaults for internode and intranode are taken from DeepEP tests.
        one_gib = 1024 * 1024 * 1024

        if self.internode:
            rdma_bytes, qps_per_rank = one_gib, self.num_sms // 2
        else:
            rdma_bytes, qps_per_rank = 0, 1

        return {
            "group": self.cpu_group,
            "num_nvl_bytes": one_gib,
            "num_rdma_bytes": rdma_bytes,
            "low_latency_mode": False,
            "num_qps_per_rank": qps_per_rank,
        }

    def get_handle(self, kwargs):
        """
        Return the ``deep_ep.Buffer`` for this manager's settings.

        ``kwargs`` must be empty: every buffer argument is derived by
        ``_make_all2all_kwargs``. The buffer is obtained through
        ``self.handle_cache.get_or_create``.
        """
        assert len(kwargs) == 0, (
            "DeepEPHTAll2AllManager expects no arguments. All the required "
            "args are computed in the Manager itself.")

        import deep_ep

        buffer_args = self._make_all2all_kwargs()
        logger.debug("DeepEP all2all args %s", buffer_args)
        buffer: deep_ep.Buffer = self.handle_cache.get_or_create(
            buffer_args, deep_ep.Buffer)

        # It is dangerous to set num sms outside this function. num_sms is
        # not a part of the hash-key that identifies this object. If we are
        # in a situation where we make objects with different num_sms, the
        # hash key in get_or_create must be updated.
        buffer.set_num_sms(self.num_sms)
        return buffer
class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
    """
    All2All communication based on DeepEP Low-Latency kernels.
    """

    def __init__(self, cpu_group):
        super().__init__(cpu_group)

    def _make_all2all_kwargs(
        self,
        max_num_tokens_per_dp_rank: int,
        token_hidden_size: int,
        num_ep_ranks: int,
        num_global_experts: int,
        num_local_experts: int,
    ) -> dict[Any, Any]:
        """
        Build the keyword arguments used to construct the DeepEP buffer.

        max_num_tokens_per_dp_rank: the maximum number of tokens a DP rank
            can dispatch. All the ranks must hold the same value.
        token_hidden_size: the hidden dimension of each token.
        num_ep_ranks: the number of EP group ranks.
        num_global_experts: number of experts in the model.
        num_local_experts: number of experts in an EP rank.
        """
        import deep_ep

        # The RDMA buffer size comes from DeepEP's own size hint.
        rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
            num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
            hidden=token_hidden_size,
            num_ranks=num_ep_ranks,
            num_experts=num_global_experts)
        assert rdma_bytes is not None

        # Defaults for internode and intranode are taken from DeepEP tests.
        return {
            "group": self.cpu_group,
            "num_nvl_bytes": 1024 * 1024 * 1024,
            "num_rdma_bytes": rdma_bytes,
            "low_latency_mode": True,
            "num_qps_per_rank": num_local_experts,
        }

    def get_handle(self, kwargs):
        """
        Return the ``deep_ep.Buffer`` for the given settings.

        The kwargs for DeepEPLLAll2AllManager is dictated by
        _make_all2all_kwargs; they are forwarded to it unchanged. The
        buffer is obtained through ``self.handle_cache.get_or_create``.
        """
        import deep_ep

        buffer_args = self._make_all2all_kwargs(**kwargs)
        logger.debug("DeepEP all2all args %s", buffer_args)
        buffer: deep_ep.Buffer = self.handle_cache.get_or_create(
            buffer_args, deep_ep.Buffer)

        # It is dangerous to set num sms outside this function. num_sms is
        # not a part of the hash-key that identifies this object. If we are
        # in a situation where we make objects with different num_sms, the
        # hash key in get_or_create must be updated.
        buffer.set_num_sms(self.num_sms)
        return buffer
vllm/distributed/device_communicators/base_device_communicator.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
threading
from
typing
import
Optional
from
weakref
import
WeakValueDictionary
...
...
@@ -48,8 +49,7 @@ class All2AllManagerBase:
# all2all communication often has separate implementations for
# intra-node and inter-node communication
self
.
intranode
=
in_the_same_node_as
(
cpu_group
,
source_rank
=
0
)
self
.
internode
=
not
self
.
intranode
self
.
internode
=
not
all
(
in_the_same_node_as
(
cpu_group
,
source_rank
=
0
))
def
get_handle
(
self
,
kwargs
):
# get a handle for the all2all communication,
...
...
vllm/distributed/device_communicators/cpu_communicator.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
typing
import
Optional
...
...
vllm/distributed/device_communicators/cuda_communicator.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
...
...
@@ -66,6 +67,14 @@ class CudaCommunicator(DeviceCommunicatorBase):
from
.all2all
import
PPLXAll2AllManager
self
.
all2all_manager
=
PPLXAll2AllManager
(
self
.
cpu_group
)
logger
.
info
(
"Using PPLX all2all manager."
)
elif
all2all_backend
==
"deepep_high_throughput"
:
from
.all2all
import
DeepEPHTAll2AllManager
self
.
all2all_manager
=
DeepEPHTAll2AllManager
(
self
.
cpu_group
)
logger
.
info
(
"Using DeepEP High-Throughput all2all manager."
)
elif
all2all_backend
==
"deepep_low_latency"
:
from
.all2all
import
DeepEPLLAll2AllManager
self
.
all2all_manager
=
DeepEPLLAll2AllManager
(
self
.
cpu_group
)
logger
.
info
(
"Using DeepEP Low-Latency all2all manager."
)
else
:
raise
ValueError
(
f
"Unknown all2all backend:
{
all2all_backend
}
"
)
...
...
vllm/distributed/device_communicators/cuda_wrapper.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""This file is a pure Python wrapper for the cudart library.
It avoids the need to compile a separate shared library, and is
convenient for use when we just need to call a few functions.
...
...
vllm/distributed/device_communicators/custom_all_reduce.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
contextlib
import
contextmanager
from
typing
import
Optional
,
Union
...
...
Prev
1
…
41
42
43
44
45
46
47
48
49
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment