Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
afd0da21
Commit
afd0da21
authored
Feb 03, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.1' into v0.7.1-dev
parents
1a11f127
4f4d427a
Changes
587
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
627 additions
and
363 deletions
+627
-363
vllm/core/block/interfaces.py
vllm/core/block/interfaces.py
+10
-0
vllm/core/block/naive_block.py
vllm/core/block/naive_block.py
+14
-5
vllm/core/block/prefix_caching_block.py
vllm/core/block/prefix_caching_block.py
+43
-1
vllm/core/block_manager.py
vllm/core/block_manager.py
+5
-2
vllm/core/interfaces.py
vllm/core/interfaces.py
+5
-0
vllm/core/placeholder_block_space_manager.py
vllm/core/placeholder_block_space_manager.py
+3
-0
vllm/core/scheduler.py
vllm/core/scheduler.py
+15
-12
vllm/device_allocator/__init__.py
vllm/device_allocator/__init__.py
+0
-0
vllm/device_allocator/cumem.py
vllm/device_allocator/cumem.py
+254
-0
vllm/distributed/device_communicators/pynccl.py
vllm/distributed/device_communicators/pynccl.py
+9
-35
vllm/distributed/device_communicators/shm_broadcast.py
vllm/distributed/device_communicators/shm_broadcast.py
+6
-5
vllm/distributed/kv_transfer/README.md
vllm/distributed/kv_transfer/README.md
+1
-1
vllm/distributed/kv_transfer/kv_connector/factory.py
vllm/distributed/kv_transfer/kv_connector/factory.py
+38
-10
vllm/distributed/kv_transfer/kv_connector/simple_connector.py
.../distributed/kv_transfer/kv_connector/simple_connector.py
+2
-1
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+21
-23
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+79
-46
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+12
-81
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+86
-94
vllm/engine/metrics.py
vllm/engine/metrics.py
+2
-44
vllm/engine/multiprocessing/__init__.py
vllm/engine/multiprocessing/__init__.py
+22
-3
No files found.
Too many changes to show.
To preserve performance only
587 of 587+
files are displayed.
Plain diff
Email patch
vllm/core/block/interfaces.py
View file @
afd0da21
...
@@ -192,6 +192,11 @@ class BlockAllocator(ABC):
...
@@ -192,6 +192,11 @@ class BlockAllocator(ABC):
"""Prefix cache hit rate. -1 means not supported or disabled."""
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
pass
@
abstractmethod
def
reset_prefix_cache
(
self
)
->
bool
:
"""Reset prefix cache."""
pass
class
NoFreeBlocksError
(
ValueError
):
class
NoFreeBlocksError
(
ValueError
):
pass
pass
...
@@ -297,6 +302,11 @@ class DeviceAwareBlockAllocator(ABC):
...
@@ -297,6 +302,11 @@ class DeviceAwareBlockAllocator(ABC):
"""Prefix cache hit rate. -1 means not supported or disabled."""
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
pass
@
abstractmethod
def
reset_prefix_cache
(
self
)
->
bool
:
"""Reset prefix cache."""
pass
@
abstractmethod
@
abstractmethod
def
find_cached_blocks_prefix
(
def
find_cached_blocks_prefix
(
self
,
self
,
...
...
vllm/core/block/naive_block.py
View file @
afd0da21
from
collections
import
deque
from
collections
import
deque
from
typing
import
Deque
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Tuple
from
typing
import
Deque
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
from
vllm.core.block.common
import
(
BlockPool
,
CopyOnWriteTracker
,
RefCounter
,
from
vllm.core.block.common
import
(
BlockPool
,
CopyOnWriteTracker
,
RefCounter
,
get_all_blocks_recursively
)
get_all_blocks_recursively
)
...
@@ -136,16 +136,18 @@ class NaiveBlockAllocator(BlockAllocator):
...
@@ -136,16 +136,18 @@ class NaiveBlockAllocator(BlockAllocator):
self
.
_refcounter
.
incr
(
block_id
)
self
.
_refcounter
.
incr
(
block_id
)
return
block_id
return
block_id
def
_free_block_id
(
self
,
block
:
Block
)
->
None
:
def
_free_block_id
(
self
,
block
:
Union
[
Block
,
BlockId
])
->
None
:
block_id
=
block
.
block_id
if
isinstance
(
block
,
Block
):
block_id
=
block
.
block_id
block
.
block_id
=
None
else
:
block_id
=
block
assert
block_id
is
not
None
assert
block_id
is
not
None
refcount
=
self
.
_refcounter
.
decr
(
block_id
)
refcount
=
self
.
_refcounter
.
decr
(
block_id
)
if
refcount
==
0
:
if
refcount
==
0
:
self
.
_free_block_indices
.
appendleft
(
block_id
)
self
.
_free_block_indices
.
appendleft
(
block_id
)
block
.
block_id
=
None
def
free
(
self
,
block
:
Block
,
keep_block_object
:
bool
=
False
)
->
None
:
def
free
(
self
,
block
:
Block
,
keep_block_object
:
bool
=
False
)
->
None
:
# Release the physical block id
# Release the physical block id
self
.
_free_block_id
(
block
)
self
.
_free_block_id
(
block
)
...
@@ -154,6 +156,9 @@ class NaiveBlockAllocator(BlockAllocator):
...
@@ -154,6 +156,9 @@ class NaiveBlockAllocator(BlockAllocator):
if
not
keep_block_object
:
if
not
keep_block_object
:
self
.
_block_pool
.
free_block
(
block
)
self
.
_block_pool
.
free_block
(
block
)
def
free_block_id
(
self
,
block_id
:
BlockId
)
->
None
:
self
.
_free_block_id
(
block_id
)
def
fork
(
self
,
last_block
:
Block
)
->
List
[
Block
]:
def
fork
(
self
,
last_block
:
Block
)
->
List
[
Block
]:
"""Creates a new sequence of blocks that shares the same underlying
"""Creates a new sequence of blocks that shares the same underlying
memory as the original sequence.
memory as the original sequence.
...
@@ -325,6 +330,10 @@ class NaiveBlockAllocator(BlockAllocator):
...
@@ -325,6 +330,10 @@ class NaiveBlockAllocator(BlockAllocator):
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
return
-
1
return
-
1
def
reset_prefix_cache
(
self
)
->
bool
:
"""No prefix cache for naive block allocator."""
return
True
def
find_cached_blocks_prefix
(
self
,
block_hashes
:
List
[
int
])
->
List
[
int
]:
def
find_cached_blocks_prefix
(
self
,
block_hashes
:
List
[
int
])
->
List
[
int
]:
# Not applicable for naive block allocator.
# Not applicable for naive block allocator.
return
[]
return
[]
...
...
vllm/core/block/prefix_caching_block.py
View file @
afd0da21
...
@@ -12,6 +12,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device,
...
@@ -12,6 +12,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device,
from
vllm.core.block.naive_block
import
(
BlockPool
,
NaiveBlock
,
from
vllm.core.block.naive_block
import
(
BlockPool
,
NaiveBlock
,
NaiveBlockAllocator
)
NaiveBlockAllocator
)
from
vllm.core.evictor
import
EvictionPolicy
,
Evictor
,
make_evictor
from
vllm.core.evictor
import
EvictionPolicy
,
Evictor
,
make_evictor
from
vllm.logger
import
init_logger
from
vllm.sequence
import
Sequence
from
vllm.sequence
import
Sequence
PrefixHash
=
int
PrefixHash
=
int
...
@@ -21,6 +22,8 @@ PrefixHash = int
...
@@ -21,6 +22,8 @@ PrefixHash = int
# then we know this block hasn't been accessed yet.
# then we know this block hasn't been accessed yet.
_DEFAULT_LAST_ACCESSED_TIME
=
-
1
_DEFAULT_LAST_ACCESSED_TIME
=
-
1
logger
=
init_logger
(
__name__
)
class
BlockTracker
:
class
BlockTracker
:
"""Used to track the status of a block inside the prefix caching allocator
"""Used to track the status of a block inside the prefix caching allocator
...
@@ -105,7 +108,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
...
@@ -105,7 +108,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
# Evitor used to maintain how we want to handle those computed blocks
# Evitor used to maintain how we want to handle those computed blocks
# if we find memory pressure is high.
# if we find memory pressure is high.
self
.
evictor
:
Evictor
=
make_evictor
(
eviction_policy
)
self
.
eviction_policy
=
eviction_policy
self
.
evictor
:
Evictor
=
make_evictor
(
self
.
eviction_policy
)
# We share the refcounter between allocators. This allows us to promote
# We share the refcounter between allocators. This allows us to promote
# blocks originally allocated in the hashless allocator to immutable
# blocks originally allocated in the hashless allocator to immutable
...
@@ -428,6 +432,44 @@ class PrefixCachingBlockAllocator(BlockAllocator):
...
@@ -428,6 +432,44 @@ class PrefixCachingBlockAllocator(BlockAllocator):
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
return
self
.
metric_data
.
get_hit_rate
()
return
self
.
metric_data
.
get_hit_rate
()
def
reset_prefix_cache
(
self
)
->
bool
:
"""Reset prefix cache. This function may be used in RLHF
flows to invalid prefix caching after the weights are updated,
or used for resetting prefix caching status for benchmarking.
Returns:
bool: True if the prefix cache is successfully reset,
False otherwise.
"""
num_used_blocks
=
(
self
.
get_num_total_blocks
()
-
self
.
get_num_free_blocks
())
if
num_used_blocks
>
0
:
logger
.
warning
(
"Failed to reset prefix cache because some "
"blocks (%d) are not freed yet"
,
num_used_blocks
)
return
False
# Free all blocks in the evictor.
while
(
block_id
:
=
self
.
_maybe_allocate_evicted_block_id
())
is
not
None
:
self
.
_hashless_allocator
.
free_block_id
(
block_id
)
# Should not have any cached blocks because all blocks are evicted.
assert
not
self
.
_cached_blocks
# Reset the evictor.
self
.
evictor
=
make_evictor
(
self
.
eviction_policy
)
# Reset the block tracker.
for
block_id
in
self
.
_block_tracker
:
self
.
_block_tracker
[
block_id
]
=
BlockTracker
()
# Reset the metrics.
self
.
metric_data
=
CacheMetricData
()
logger
.
info
(
"Successfully reset prefix cache"
)
return
True
def
is_block_cached
(
self
,
block
:
Block
)
->
bool
:
def
is_block_cached
(
self
,
block
:
Block
)
->
bool
:
assert
block
.
content_hash
is
not
None
assert
block
.
content_hash
is
not
None
return
block
.
content_hash
in
self
.
_cached_blocks
return
block
.
content_hash
in
self
.
_cached_blocks
...
...
vllm/core/block_manager.py
View file @
afd0da21
...
@@ -136,8 +136,8 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
...
@@ -136,8 +136,8 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
device
=
Device
.
GPU
)
device
=
Device
.
GPU
)
# Use watermark to avoid frequent cache eviction.
# Use watermark to avoid frequent cache eviction.
if
(
self
.
num_total_gpu_blocks
-
num_required_blocks
<
if
(
self
.
num_total_gpu_blocks
-
num_required_blocks
self
.
watermark_blocks
):
<
self
.
watermark_blocks
):
return
AllocStatus
.
NEVER
return
AllocStatus
.
NEVER
if
num_free_gpu_blocks
-
num_required_blocks
>=
self
.
watermark_blocks
:
if
num_free_gpu_blocks
-
num_required_blocks
>=
self
.
watermark_blocks
:
return
AllocStatus
.
OK
return
AllocStatus
.
OK
...
@@ -455,6 +455,9 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
...
@@ -455,6 +455,9 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
return
self
.
block_allocator
.
get_prefix_cache_hit_rate
(
device
)
return
self
.
block_allocator
.
get_prefix_cache_hit_rate
(
device
)
def
reset_prefix_cache
(
self
)
->
bool
:
return
self
.
block_allocator
.
reset_prefix_cache
()
def
_can_swap
(
self
,
def
_can_swap
(
self
,
seq_group
:
SequenceGroup
,
seq_group
:
SequenceGroup
,
device
:
Device
,
device
:
Device
,
...
...
vllm/core/interfaces.py
View file @
afd0da21
...
@@ -122,6 +122,11 @@ class BlockSpaceManager(ABC):
...
@@ -122,6 +122,11 @@ class BlockSpaceManager(ABC):
"""Prefix cache hit rate. -1 means not supported or disabled."""
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
pass
@
abstractmethod
def
reset_prefix_cache
(
self
)
->
bool
:
"""Reset prefix cache for all devices."""
pass
@
abstractmethod
@
abstractmethod
def
get_num_cached_tokens
(
self
,
seq
:
Sequence
)
->
int
:
def
get_num_cached_tokens
(
self
,
seq
:
Sequence
)
->
int
:
pass
pass
vllm/core/placeholder_block_space_manager.py
View file @
afd0da21
...
@@ -90,5 +90,8 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager):
...
@@ -90,5 +90,8 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager):
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
return
-
1
return
-
1
def
reset_prefix_cache
(
self
)
->
bool
:
return
True
def
get_num_cached_tokens
(
self
,
seq
:
Sequence
)
->
int
:
def
get_num_cached_tokens
(
self
,
seq
:
Sequence
)
->
int
:
return
0
return
0
vllm/core/scheduler.py
View file @
afd0da21
...
@@ -504,6 +504,9 @@ class Scheduler:
...
@@ -504,6 +504,9 @@ class Scheduler:
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
return
self
.
block_manager
.
get_prefix_cache_hit_rate
(
device
)
return
self
.
block_manager
.
get_prefix_cache_hit_rate
(
device
)
def
reset_prefix_cache
(
self
)
->
bool
:
return
self
.
block_manager
.
reset_prefix_cache
()
def
get_num_unfinished_seq_groups
(
self
)
->
int
:
def
get_num_unfinished_seq_groups
(
self
)
->
int
:
return
len
(
self
.
waiting
)
+
len
(
self
.
running
)
+
len
(
self
.
swapped
)
return
len
(
self
.
waiting
)
+
len
(
self
.
running
)
+
len
(
self
.
swapped
)
...
@@ -985,8 +988,8 @@ class Scheduler:
...
@@ -985,8 +988,8 @@ class Scheduler:
waiting_queue
.
popleft
()
waiting_queue
.
popleft
()
continue
continue
if
(
budget
.
num_batched_tokens
>=
if
(
budget
.
num_batched_tokens
self
.
scheduler_config
.
max_num_batched_tokens
):
>=
self
.
scheduler_config
.
max_num_batched_tokens
):
# We've reached the budget limit - since there might be
# We've reached the budget limit - since there might be
# continuous prefills in the running queue, we should break
# continuous prefills in the running queue, we should break
# to avoid scheduling any new prefills.
# to avoid scheduling any new prefills.
...
@@ -1093,8 +1096,8 @@ class Scheduler:
...
@@ -1093,8 +1096,8 @@ class Scheduler:
running_scheduled
.
swapped_out
)
==
0
:
running_scheduled
.
swapped_out
)
==
0
:
swapped_in
=
self
.
_schedule_swapped
(
budget
,
curr_loras
)
swapped_in
=
self
.
_schedule_swapped
(
budget
,
curr_loras
)
assert
(
budget
.
num_batched_tokens
<=
assert
(
budget
.
num_batched_tokens
self
.
scheduler_config
.
max_num_batched_tokens
)
<=
self
.
scheduler_config
.
max_num_batched_tokens
)
assert
budget
.
num_curr_seqs
<=
self
.
scheduler_config
.
max_num_seqs
assert
budget
.
num_curr_seqs
<=
self
.
scheduler_config
.
max_num_seqs
# Update waiting requests.
# Update waiting requests.
...
@@ -1186,8 +1189,8 @@ class Scheduler:
...
@@ -1186,8 +1189,8 @@ class Scheduler:
curr_loras
,
curr_loras
,
enable_chunking
=
True
)
enable_chunking
=
True
)
assert
(
budget
.
num_batched_tokens
<=
assert
(
budget
.
num_batched_tokens
self
.
scheduler_config
.
max_num_batched_tokens
)
<=
self
.
scheduler_config
.
max_num_batched_tokens
)
assert
budget
.
num_curr_seqs
<=
self
.
scheduler_config
.
max_num_seqs
assert
budget
.
num_curr_seqs
<=
self
.
scheduler_config
.
max_num_seqs
# Update waiting requests.
# Update waiting requests.
...
@@ -1355,8 +1358,8 @@ class Scheduler:
...
@@ -1355,8 +1358,8 @@ class Scheduler:
# NOTE: We use get_len instead of get_prompt_len because when
# NOTE: We use get_len instead of get_prompt_len because when
# a sequence is preempted, prefill includes previous generated
# a sequence is preempted, prefill includes previous generated
# output tokens.
# output tokens.
if
(
token_chunk_size
+
num_computed_tokens
<
if
(
token_chunk_size
+
num_computed_tokens
seqs
[
0
].
data
.
get_len
()):
<
seqs
[
0
].
data
.
get_len
()):
do_sample
=
False
do_sample
=
False
# It assumes the scheduled_seq_groups is ordered by
# It assumes the scheduled_seq_groups is ordered by
...
@@ -1579,6 +1582,7 @@ class Scheduler:
...
@@ -1579,6 +1582,7 @@ class Scheduler:
seq
.
status
=
SequenceStatus
.
WAITING
seq
.
status
=
SequenceStatus
.
WAITING
self
.
free_seq
(
seq
)
self
.
free_seq
(
seq
)
seq
.
reset_state_for_recompute
()
seq
.
reset_state_for_recompute
()
self
.
_free_seq_group_cross_attn_blocks
(
seq_group
)
def
_preempt_by_swap
(
def
_preempt_by_swap
(
self
,
self
,
...
@@ -1621,10 +1625,9 @@ class Scheduler:
...
@@ -1621,10 +1625,9 @@ class Scheduler:
if
self
.
scheduler_config
.
delay_factor
>
0
and
self
.
waiting
:
if
self
.
scheduler_config
.
delay_factor
>
0
and
self
.
waiting
:
earliest_arrival_time
=
min
(
earliest_arrival_time
=
min
(
[
e
.
metrics
.
arrival_time
for
e
in
self
.
waiting
])
[
e
.
metrics
.
arrival_time
for
e
in
self
.
waiting
])
passed_delay
=
(
passed_delay
=
((
now
-
earliest_arrival_time
)
(
now
-
earliest_arrival_time
)
>
>
(
self
.
scheduler_config
.
delay_factor
*
(
self
.
scheduler_config
.
delay_factor
*
self
.
last_prompt_latency
)
self
.
last_prompt_latency
)
or
not
self
.
running
)
or
not
self
.
running
)
else
:
else
:
passed_delay
=
True
passed_delay
=
True
return
passed_delay
return
passed_delay
...
...
vllm/device_allocator/__init__.py
0 → 100644
View file @
afd0da21
vllm/device_allocator/cumem.py
0 → 100644
View file @
afd0da21
# cumem-based pytorch pluggable allocator to implement sleep mode.
# other approaches tried but failed:
# - cuda-python package binding
# - custom libcuda driver ctypes wrapper
# both of them failed because of cuda context mismatch.
# not sure why, they are created from a different context.
# the only successful approach is to call cuda driver API in C.
import
dataclasses
from
contextlib
import
contextmanager
from
typing
import
Callable
,
Dict
,
Optional
,
Tuple
,
Union
import
torch
from
vllm.utils
import
is_pin_memory_available
def
find_loaded_library
(
lib_name
)
->
Optional
[
str
]:
"""
According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
the file `/proc/self/maps` contains the memory maps of the process, which includes the
shared libraries loaded by the process. We can use this file to find the path of the
a loaded library.
"""
# noqa
found_line
=
None
with
open
(
"/proc/self/maps"
)
as
f
:
for
line
in
f
:
if
lib_name
in
line
:
found_line
=
line
break
if
found_line
is
None
:
# the library is not loaded in the current process
return
None
# if lib_name is libcudart, we need to match a line with:
# address /path/to/libcudart-hash.so.11.0
start
=
found_line
.
index
(
"/"
)
path
=
found_line
[
start
:].
strip
()
filename
=
path
.
split
(
"/"
)[
-
1
]
assert
filename
.
rpartition
(
".so"
)[
0
].
startswith
(
lib_name
),
\
f
"Unexpected filename:
{
filename
}
for library
{
lib_name
}
"
return
path
cumem_available
=
False
try
:
from
vllm.cumem_allocator
import
(
init_module
,
python_create_and_map
,
python_unmap_and_release
)
from
vllm.distributed.device_communicators.cuda_wrapper
import
(
CudaRTLibrary
)
lib_name
=
find_loaded_library
(
"cumem_allocator"
)
libcudart
=
CudaRTLibrary
()
cumem_available
=
True
except
ModuleNotFoundError
:
# rocm platform does not support cumem allocator
init_module
=
None
python_create_and_map
=
None
python_unmap_and_release
=
None
CudaRTLibrary
=
None
lib_name
=
None
libcudart
=
None
# py_device, py_alignedSize, py_d_mem, py_p_memHandle
HandleType
=
Tuple
[
int
,
int
,
int
,
int
]
@
dataclasses
.
dataclass
class
AllocationData
:
handle
:
HandleType
tag
:
str
cpu_backup_tensor
:
Optional
[
torch
.
Tensor
]
=
None
def
create_and_map
(
allocation_handle
:
HandleType
)
->
None
:
python_create_and_map
(
*
allocation_handle
)
def
unmap_and_release
(
allocation_handle
:
HandleType
)
->
None
:
python_unmap_and_release
(
*
allocation_handle
)
def
get_pluggable_allocator
(
python_malloc_fn
:
Callable
[[
int
],
int
],
python_free_func
:
Callable
[[
int
,
int
],
None
]
)
->
torch
.
cuda
.
memory
.
CUDAPluggableAllocator
:
init_module
(
python_malloc_fn
,
python_free_func
)
new_alloc
=
torch
.
cuda
.
memory
.
CUDAPluggableAllocator
(
lib_name
,
'my_malloc'
,
'my_free'
)
return
new_alloc
@
contextmanager
def
use_memory_pool_with_allocator
(
python_malloc_fn
:
Callable
[[
int
],
int
],
python_free_func
:
Callable
[[
int
,
int
],
None
])
->
None
:
new_alloc
=
get_pluggable_allocator
(
python_malloc_fn
,
python_free_func
)
mem_pool
=
torch
.
cuda
.
memory
.
MemPool
(
new_alloc
.
_allocator
)
with
torch
.
cuda
.
memory
.
use_mem_pool
(
mem_pool
):
yield
mem_pool
class
CuMemAllocator
:
"""
A singleton class that manages a memory pool for CUDA tensors.
The memory in this pool can be offloaded or discarded when the
allocator sleeps.
Inside the `use_memory_pool(tag)` context, all tensors created will
be allocated in the memory pool, and has the same tag as the
tag passed to the context.
When we call `sleep`, all tensors with the specified tag will be
offloaded to CPU memory, and the rest of the tensors will be discarded.
When we call `wake_up`, all tensors that are previously offloaded
will be loaded back to GPU memory, and the rest of the tensors will
have empty memory.
Why it needs to be a singleton?
When allocated tensors are garbage collected, PyTorch will call
the free callback, which will call the `python_free_callback` method.
The C-extension uses a global variable to store the function of an
instance of this class. If we create multiple instances of this class,
the global variable will be overwritten and the free callback will
not work as expected.
"""
instance
:
"CuMemAllocator"
=
None
default_tag
:
str
=
"default"
@
staticmethod
def
get_instance
()
->
"CuMemAllocator"
:
"""
CuMemAllocator is a singleton class.
We cannot call the constructor directly.
Call this method to get the instance.
"""
assert
cumem_available
,
"cumem allocator is not available"
if
CuMemAllocator
.
instance
is
None
:
CuMemAllocator
.
instance
=
CuMemAllocator
()
return
CuMemAllocator
.
instance
def
__init__
(
self
):
self
.
pointer_to_data
:
Dict
[
int
,
AllocationData
]
=
{}
self
.
current_tag
:
str
=
CuMemAllocator
.
default_tag
def
python_malloc_callback
(
self
,
allocation_handle
:
HandleType
)
->
None
:
"""
Internal method to store the allocation data
when memory is allocated in the memory pool."""
py_d_mem
=
allocation_handle
[
2
]
self
.
pointer_to_data
[
py_d_mem
]
=
AllocationData
(
allocation_handle
,
self
.
current_tag
)
return
def
python_free_callback
(
self
,
ptr
:
int
)
->
HandleType
:
"""
Internal method to look up the allocation data
when memory is freed in the memory pool."""
data
=
self
.
pointer_to_data
.
pop
(
ptr
)
if
data
.
cpu_backup_tensor
is
not
None
:
data
.
cpu_backup_tensor
=
None
return
data
.
handle
def
sleep
(
self
,
offload_tags
:
Optional
[
Union
[
Tuple
[
str
,
...],
str
]]
=
None
)
->
None
:
"""
Put the allocator in sleep mode.
All data in the memory allocation with the specified tag will be
offloaded to CPU memory, and others will be discarded.
:param offload_tags: The tags of the memory allocation that will be
offloaded. The rest of the memory allocation will be discarded.
"""
if
offload_tags
is
None
:
# by default, allocated tensors are offloaded
# when the allocator sleeps
offload_tags
=
(
CuMemAllocator
.
default_tag
,
)
elif
isinstance
(
offload_tags
,
str
):
offload_tags
=
(
offload_tags
,
)
assert
isinstance
(
offload_tags
,
tuple
)
for
ptr
,
data
in
self
.
pointer_to_data
.
items
():
handle
=
data
.
handle
if
data
.
tag
in
offload_tags
:
size_in_bytes
=
handle
[
1
]
cpu_backup_tensor
=
torch
.
empty
(
size_in_bytes
,
dtype
=
torch
.
uint8
,
device
=
'cpu'
,
pin_memory
=
is_pin_memory_available
())
cpu_ptr
=
cpu_backup_tensor
.
data_ptr
()
libcudart
.
cudaMemcpy
(
cpu_ptr
,
ptr
,
size_in_bytes
)
data
.
cpu_backup_tensor
=
cpu_backup_tensor
unmap_and_release
(
handle
)
def
wake_up
(
self
):
"""
Wake up the allocator from sleep mode.
All data that is previously offloaded will be loaded back to GPU
memory, and the rest of the data will have empty memory."""
for
ptr
,
data
in
self
.
pointer_to_data
.
items
():
handle
=
data
.
handle
create_and_map
(
handle
)
if
data
.
cpu_backup_tensor
is
not
None
:
cpu_backup_tensor
=
data
.
cpu_backup_tensor
if
cpu_backup_tensor
is
not
None
:
size_in_bytes
=
cpu_backup_tensor
.
numel
(
)
*
cpu_backup_tensor
.
element_size
()
cpu_ptr
=
cpu_backup_tensor
.
data_ptr
()
libcudart
.
cudaMemcpy
(
ptr
,
cpu_ptr
,
size_in_bytes
)
data
.
cpu_backup_tensor
=
None
@
contextmanager
def
use_memory_pool
(
self
,
tag
:
Optional
[
str
]
=
None
):
"""
A context manager to use the memory pool.
All memory allocation created inside the context will be allocated
in the memory pool, and has the specified tag.
:param tag: The tag of the memory allocation. If None, the default tag
will be used.
"""
if
tag
is
None
:
tag
=
CuMemAllocator
.
default_tag
assert
isinstance
(
tag
,
str
)
old_tag
=
self
.
current_tag
self
.
current_tag
=
tag
with
use_memory_pool_with_allocator
(
self
.
python_malloc_callback
,
self
.
python_free_callback
):
yield
# PyTorch's bug, calling torch.cuda.empty_cache() will error
# when using pluggable allocator, see
# https://github.com/pytorch/pytorch/issues/145168 .
# if we have some memory allocated and then freed,
# the memory will not be released.
# right now it is fine, because we only use this allocator
# during weight loading and kv cache creation, where we only
# allocate memory.
# TODO: we need to find a way to release the memory,
# i.e. calling torch.cuda.empty_cache()
self
.
current_tag
=
old_tag
def
get_current_usage
(
self
)
->
int
:
"""
Get the total number of bytes allocated in the memory pool.
"""
sum_bytes
:
int
=
0
for
ptr
,
data
in
self
.
pointer_to_data
.
items
():
handle
=
data
.
handle
sum_bytes
+=
handle
[
1
]
return
sum_bytes
vllm/distributed/device_communicators/pynccl.py
View file @
afd0da21
from
contextlib
import
contextmanager
from
typing
import
Optional
,
Union
from
typing
import
Optional
,
Union
# ===================== import region =====================
# ===================== import region =====================
...
@@ -11,6 +10,7 @@ from vllm.distributed.device_communicators.pynccl_wrapper import (
...
@@ -11,6 +10,7 @@ from vllm.distributed.device_communicators.pynccl_wrapper import (
ncclRedOpTypeEnum
,
ncclUniqueId
)
ncclRedOpTypeEnum
,
ncclUniqueId
)
from
vllm.distributed.utils
import
StatelessProcessGroup
from
vllm.distributed.utils
import
StatelessProcessGroup
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
current_stream
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -51,7 +51,6 @@ class PyNcclCommunicator:
...
@@ -51,7 +51,6 @@ class PyNcclCommunicator:
if
self
.
world_size
==
1
:
if
self
.
world_size
==
1
:
self
.
available
=
False
self
.
available
=
False
self
.
disabled
=
True
self
.
disabled
=
True
self
.
stream
=
None
return
return
try
:
try
:
self
.
nccl
=
NCCLLibrary
(
library_path
)
self
.
nccl
=
NCCLLibrary
(
library_path
)
...
@@ -60,7 +59,6 @@ class PyNcclCommunicator:
...
@@ -60,7 +59,6 @@ class PyNcclCommunicator:
# e.g. in a non-GPU environment
# e.g. in a non-GPU environment
self
.
available
=
False
self
.
available
=
False
self
.
disabled
=
True
self
.
disabled
=
True
self
.
stream
=
None
return
return
self
.
available
=
True
self
.
available
=
True
...
@@ -98,12 +96,12 @@ class PyNcclCommunicator:
...
@@ -98,12 +96,12 @@ class PyNcclCommunicator:
with
torch
.
cuda
.
device
(
device
):
with
torch
.
cuda
.
device
(
device
):
self
.
comm
:
ncclComm_t
=
self
.
nccl
.
ncclCommInitRank
(
self
.
comm
:
ncclComm_t
=
self
.
nccl
.
ncclCommInitRank
(
self
.
world_size
,
self
.
unique_id
,
self
.
rank
)
self
.
world_size
,
self
.
unique_id
,
self
.
rank
)
self
.
stream
=
torch
.
cuda
.
Stream
()
stream
=
current_stream
()
# A small all_reduce for warmup.
# A small all_reduce for warmup.
data
=
torch
.
zeros
(
1
,
device
=
device
)
data
=
torch
.
zeros
(
1
,
device
=
device
)
self
.
all_reduce
(
data
)
self
.
all_reduce
(
data
)
self
.
stream
.
synchronize
()
stream
.
synchronize
()
del
data
del
data
def
all_reduce
(
self
,
def
all_reduce
(
self
,
...
@@ -122,7 +120,7 @@ class PyNcclCommunicator:
...
@@ -122,7 +120,7 @@ class PyNcclCommunicator:
out_tensor
=
torch
.
empty_like
(
in_tensor
)
out_tensor
=
torch
.
empty_like
(
in_tensor
)
if
stream
is
None
:
if
stream
is
None
:
stream
=
self
.
stream
stream
=
current_
stream
()
self
.
nccl
.
ncclAllReduce
(
buffer_type
(
in_tensor
.
data_ptr
()),
self
.
nccl
.
ncclAllReduce
(
buffer_type
(
in_tensor
.
data_ptr
()),
buffer_type
(
out_tensor
.
data_ptr
()),
buffer_type
(
out_tensor
.
data_ptr
()),
in_tensor
.
numel
(),
in_tensor
.
numel
(),
...
@@ -144,7 +142,7 @@ class PyNcclCommunicator:
...
@@ -144,7 +142,7 @@ class PyNcclCommunicator:
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"but the input tensor is on
{
input_tensor
.
device
}
"
)
f
"but the input tensor is on
{
input_tensor
.
device
}
"
)
if
stream
is
None
:
if
stream
is
None
:
stream
=
self
.
stream
stream
=
current_
stream
()
self
.
nccl
.
ncclAllGather
(
self
.
nccl
.
ncclAllGather
(
buffer_type
(
input_tensor
.
data_ptr
()),
buffer_type
(
input_tensor
.
data_ptr
()),
buffer_type
(
output_tensor
.
data_ptr
()),
input_tensor
.
numel
(),
buffer_type
(
output_tensor
.
data_ptr
()),
input_tensor
.
numel
(),
...
@@ -165,7 +163,7 @@ class PyNcclCommunicator:
...
@@ -165,7 +163,7 @@ class PyNcclCommunicator:
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"but the input tensor is on
{
input_tensor
.
device
}
"
)
f
"but the input tensor is on
{
input_tensor
.
device
}
"
)
if
stream
is
None
:
if
stream
is
None
:
stream
=
self
.
stream
stream
=
current_
stream
()
self
.
nccl
.
ncclReduceScatter
(
self
.
nccl
.
ncclReduceScatter
(
buffer_type
(
input_tensor
.
data_ptr
()),
buffer_type
(
input_tensor
.
data_ptr
()),
buffer_type
(
output_tensor
.
data_ptr
()),
output_tensor
.
numel
(),
buffer_type
(
output_tensor
.
data_ptr
()),
output_tensor
.
numel
(),
...
@@ -180,7 +178,7 @@ class PyNcclCommunicator:
...
@@ -180,7 +178,7 @@ class PyNcclCommunicator:
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"but the input tensor is on
{
tensor
.
device
}
"
)
f
"but the input tensor is on
{
tensor
.
device
}
"
)
if
stream
is
None
:
if
stream
is
None
:
stream
=
self
.
stream
stream
=
current_
stream
()
self
.
nccl
.
ncclSend
(
buffer_type
(
tensor
.
data_ptr
()),
tensor
.
numel
(),
self
.
nccl
.
ncclSend
(
buffer_type
(
tensor
.
data_ptr
()),
tensor
.
numel
(),
ncclDataTypeEnum
.
from_torch
(
tensor
.
dtype
),
dst
,
ncclDataTypeEnum
.
from_torch
(
tensor
.
dtype
),
dst
,
self
.
comm
,
cudaStream_t
(
stream
.
cuda_stream
))
self
.
comm
,
cudaStream_t
(
stream
.
cuda_stream
))
...
@@ -192,7 +190,7 @@ class PyNcclCommunicator:
...
@@ -192,7 +190,7 @@ class PyNcclCommunicator:
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"but the input tensor is on
{
tensor
.
device
}
"
)
f
"but the input tensor is on
{
tensor
.
device
}
"
)
if
stream
is
None
:
if
stream
is
None
:
stream
=
self
.
stream
stream
=
current_
stream
()
self
.
nccl
.
ncclRecv
(
buffer_type
(
tensor
.
data_ptr
()),
tensor
.
numel
(),
self
.
nccl
.
ncclRecv
(
buffer_type
(
tensor
.
data_ptr
()),
tensor
.
numel
(),
ncclDataTypeEnum
.
from_torch
(
tensor
.
dtype
),
src
,
ncclDataTypeEnum
.
from_torch
(
tensor
.
dtype
),
src
,
self
.
comm
,
cudaStream_t
(
stream
.
cuda_stream
))
self
.
comm
,
cudaStream_t
(
stream
.
cuda_stream
))
...
@@ -204,7 +202,7 @@ class PyNcclCommunicator:
...
@@ -204,7 +202,7 @@ class PyNcclCommunicator:
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"this nccl communicator is created to work on
{
self
.
device
}
, "
f
"but the input tensor is on
{
tensor
.
device
}
"
)
f
"but the input tensor is on
{
tensor
.
device
}
"
)
if
stream
is
None
:
if
stream
is
None
:
stream
=
self
.
stream
stream
=
current_
stream
()
if
src
==
self
.
rank
:
if
src
==
self
.
rank
:
sendbuff
=
buffer_type
(
tensor
.
data_ptr
())
sendbuff
=
buffer_type
(
tensor
.
data_ptr
())
# NCCL requires the sender also to have a receive buffer
# NCCL requires the sender also to have a receive buffer
...
@@ -215,27 +213,3 @@ class PyNcclCommunicator:
...
@@ -215,27 +213,3 @@ class PyNcclCommunicator:
self
.
nccl
.
ncclBroadcast
(
sendbuff
,
recvbuff
,
tensor
.
numel
(),
self
.
nccl
.
ncclBroadcast
(
sendbuff
,
recvbuff
,
tensor
.
numel
(),
ncclDataTypeEnum
.
from_torch
(
tensor
.
dtype
),
src
,
ncclDataTypeEnum
.
from_torch
(
tensor
.
dtype
),
src
,
self
.
comm
,
cudaStream_t
(
stream
.
cuda_stream
))
self
.
comm
,
cudaStream_t
(
stream
.
cuda_stream
))
@
contextmanager
def
change_state
(
self
,
enable
:
Optional
[
bool
]
=
None
,
stream
:
Optional
[
torch
.
cuda
.
Stream
]
=
None
):
"""
A context manager to change the state of the communicator.
"""
if
enable
is
None
:
# guess a default value when not specified
enable
=
self
.
available
if
stream
is
None
:
stream
=
self
.
stream
old_disable
=
self
.
disabled
old_stream
=
self
.
stream
self
.
stream
=
stream
self
.
disabled
=
not
enable
yield
self
.
disabled
=
old_disable
self
.
stream
=
old_stream
vllm/distributed/device_communicators/shm_broadcast.py
View file @
afd0da21
...
@@ -247,7 +247,8 @@ class MessageQueue:
...
@@ -247,7 +247,8 @@ class MessageQueue:
self
.
handle
=
Handle
(
self
.
handle
=
Handle
(
connect_ip
=
connect_ip
,
connect_ip
=
connect_ip
,
local_reader_ranks
=
local_reader_ranks
,
local_reader_ranks
=
local_reader_ranks
,
buffer_handle
=
self
.
buffer
.
handle
(),
buffer_handle
=
self
.
buffer
.
handle
()
if
self
.
buffer
is
not
None
else
None
,
local_subscribe_port
=
local_subscribe_port
,
local_subscribe_port
=
local_subscribe_port
,
remote_subscribe_port
=
remote_subscribe_port
,
remote_subscribe_port
=
remote_subscribe_port
,
)
)
...
@@ -351,8 +352,8 @@ class MessageQueue:
...
@@ -351,8 +352,8 @@ class MessageQueue:
sched_yield
()
sched_yield
()
# if we wait for a long time, log a message
# if we wait for a long time, log a message
if
(
time
.
monotonic
()
-
start_time
>
if
(
time
.
monotonic
()
-
start_time
VLLM_RINGBUFFER_WARNING_INTERVAL
*
n_warning
):
>
VLLM_RINGBUFFER_WARNING_INTERVAL
*
n_warning
):
logger
.
debug
(
"No available block found in %s second. "
,
logger
.
debug
(
"No available block found in %s second. "
,
VLLM_RINGBUFFER_WARNING_INTERVAL
)
VLLM_RINGBUFFER_WARNING_INTERVAL
)
n_warning
+=
1
n_warning
+=
1
...
@@ -409,8 +410,8 @@ class MessageQueue:
...
@@ -409,8 +410,8 @@ class MessageQueue:
sched_yield
()
sched_yield
()
# if we wait for a long time, log a message
# if we wait for a long time, log a message
if
(
time
.
monotonic
()
-
start_time
>
if
(
time
.
monotonic
()
-
start_time
VLLM_RINGBUFFER_WARNING_INTERVAL
*
n_warning
):
>
VLLM_RINGBUFFER_WARNING_INTERVAL
*
n_warning
):
logger
.
debug
(
"No available block found in %s second. "
,
logger
.
debug
(
"No available block found in %s second. "
,
VLLM_RINGBUFFER_WARNING_INTERVAL
)
VLLM_RINGBUFFER_WARNING_INTERVAL
)
n_warning
+=
1
n_warning
+=
1
...
...
vllm/distributed/kv_transfer/README.md
View file @
afd0da21
...
@@ -22,7 +22,7 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution
...
@@ -22,7 +22,7 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution
## Disaggregated prefilling
## Disaggregated prefilling
The example usage is in
[
this file
](
../../../examples/disaggregated_prefill.sh
)
.
The example usage is in
[
this file
](
../../../examples/
online_serving/
disaggregated_prefill.sh
)
.
Here is the diagram of how we run disaggretgated prefilling.
Here is the diagram of how we run disaggretgated prefilling.
...
...
vllm/distributed/kv_transfer/kv_connector/factory.py
View file @
afd0da21
from
typing
import
TYPE_CHECKING
import
importlib
from
typing
import
TYPE_CHECKING
,
Callable
,
Dict
,
Type
from
.base
import
KVConnectorBase
from
.base
import
KVConnectorBase
...
@@ -7,14 +8,41 @@ if TYPE_CHECKING:
...
@@ -7,14 +8,41 @@ if TYPE_CHECKING:
class
KVConnectorFactory
:
class
KVConnectorFactory
:
_registry
:
Dict
[
str
,
Callable
[[],
Type
[
KVConnectorBase
]]]
=
{}
@
staticmethod
@
classmethod
def
create_connector
(
rank
:
int
,
local_rank
:
int
,
def
register_connector
(
cls
,
name
:
str
,
module_path
:
str
,
class_name
:
str
)
->
None
:
"""Register a connector with a lazy-loading module and class name."""
if
name
in
cls
.
_registry
:
raise
ValueError
(
f
"Connector '
{
name
}
' is already registered."
)
def
loader
()
->
Type
[
KVConnectorBase
]:
module
=
importlib
.
import_module
(
module_path
)
return
getattr
(
module
,
class_name
)
cls
.
_registry
[
name
]
=
loader
@
classmethod
def
create_connector
(
cls
,
rank
:
int
,
local_rank
:
int
,
config
:
"VllmConfig"
)
->
KVConnectorBase
:
config
:
"VllmConfig"
)
->
KVConnectorBase
:
supported_kv_connector
=
[
"PyNcclConnector"
,
"MooncakeConnector"
]
connector_name
=
config
.
kv_transfer_config
.
kv_connector
if
config
.
kv_transfer_config
.
kv_connector
in
supported_kv_connector
:
if
connector_name
not
in
cls
.
_registry
:
from
.simple_connector
import
SimpleConnector
raise
ValueError
(
f
"Unsupported connector type:
{
connector_name
}
"
)
return
SimpleConnector
(
rank
,
local_rank
,
config
)
else
:
connector_cls
=
cls
.
_registry
[
connector_name
]()
raise
ValueError
(
f
"Unsupported connector type: "
return
connector_cls
(
rank
,
local_rank
,
config
)
f
"
{
config
.
kv_connector
}
"
)
# Register various connectors here.
# The registration should not be done in each individual file, as we want to
# only load the files corresponding to the current connector.
KVConnectorFactory
.
register_connector
(
"PyNcclConnector"
,
"vllm.distributed.kv_transfer.kv_connector.simple_connector"
,
"SimpleConnector"
)
KVConnectorFactory
.
register_connector
(
"MooncakeConnector"
,
"vllm.distributed.kv_transfer.kv_connector.simple_connector"
,
"SimpleConnector"
)
vllm/distributed/kv_transfer/kv_connector/simple_connector.py
View file @
afd0da21
...
@@ -35,6 +35,7 @@ class SimpleConnector(KVConnectorBase):
...
@@ -35,6 +35,7 @@ class SimpleConnector(KVConnectorBase):
):
):
self
.
config
=
config
.
kv_transfer_config
self
.
config
=
config
.
kv_transfer_config
self
.
tp_size
=
config
.
parallel_config
.
tensor_parallel_size
if
self
.
config
.
kv_connector
==
"PyNcclConnector"
:
if
self
.
config
.
kv_connector
==
"PyNcclConnector"
:
from
vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe
import
(
from
vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe
import
(
...
@@ -161,7 +162,7 @@ class SimpleConnector(KVConnectorBase):
...
@@ -161,7 +162,7 @@ class SimpleConnector(KVConnectorBase):
end_layer
=
model_executable
.
model
.
end_layer
end_layer
=
model_executable
.
model
.
end_layer
model_config
=
model_executable
.
model
.
config
model_config
=
model_executable
.
model
.
config
num_heads
=
model_config
.
num_key_value_heads
num_heads
=
int
(
model_config
.
num_key_value_heads
/
self
.
tp_size
)
hidden_size
=
model_config
.
hidden_size
hidden_size
=
model_config
.
hidden_size
num_attention_heads
=
model_config
.
num_attention_heads
num_attention_heads
=
model_config
.
num_attention_heads
head_size
=
int
(
hidden_size
/
num_attention_heads
)
head_size
=
int
(
hidden_size
/
num_attention_heads
)
...
...
vllm/distributed/parallel_state.py
View file @
afd0da21
...
@@ -39,7 +39,6 @@ import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer
...
@@ -39,7 +39,6 @@ import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.distributed.utils
import
StatelessProcessGroup
from
vllm.distributed.utils
import
StatelessProcessGroup
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.utils
import
direct_register_custom_op
,
supports_custom_op
from
vllm.utils
import
direct_register_custom_op
,
supports_custom_op
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
...
@@ -194,6 +193,7 @@ class GroupCoordinator:
...
@@ -194,6 +193,7 @@ class GroupCoordinator:
assert
self
.
cpu_group
is
not
None
assert
self
.
cpu_group
is
not
None
assert
self
.
device_group
is
not
None
assert
self
.
device_group
is
not
None
from
vllm.platforms
import
current_platform
if
current_platform
.
is_cuda_alike
():
if
current_platform
.
is_cuda_alike
():
self
.
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
self
.
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
else
:
else
:
...
@@ -305,15 +305,7 @@ class GroupCoordinator:
...
@@ -305,15 +305,7 @@ class GroupCoordinator:
stream
.
wait_stream
(
curr_stream
)
stream
.
wait_stream
(
curr_stream
)
with
torch
.
cuda
.
stream
(
stream
),
maybe_ca_context
:
with
torch
.
cuda
.
stream
(
stream
),
maybe_ca_context
:
pynccl_comm
=
self
.
pynccl_comm
yield
graph_capture_context
maybe_pynccl_context
:
Any
if
not
pynccl_comm
:
maybe_pynccl_context
=
nullcontext
()
else
:
maybe_pynccl_context
=
pynccl_comm
.
change_state
(
stream
=
torch
.
cuda
.
current_stream
())
with
maybe_pynccl_context
:
yield
graph_capture_context
def
all_reduce
(
self
,
input_
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
all_reduce
(
self
,
input_
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""
"""
...
@@ -365,10 +357,7 @@ class GroupCoordinator:
...
@@ -365,10 +357,7 @@ class GroupCoordinator:
return
out
return
out
pynccl_comm
=
self
.
pynccl_comm
pynccl_comm
=
self
.
pynccl_comm
assert
pynccl_comm
is
not
None
assert
pynccl_comm
is
not
None
# TODO: pynccl should not use `stream=`
out
=
pynccl_comm
.
all_reduce
(
input_
)
# it can just always use the current stream.
out
=
pynccl_comm
.
all_reduce
(
input_
,
stream
=
torch
.
cuda
.
current_stream
())
if
out
is
None
:
if
out
is
None
:
# fall back to the default all-reduce using PyTorch.
# fall back to the default all-reduce using PyTorch.
# this usually happens during testing.
# this usually happens during testing.
...
@@ -873,12 +862,14 @@ def init_model_parallel_group(
...
@@ -873,12 +862,14 @@ def init_model_parallel_group(
)
->
GroupCoordinator
:
)
->
GroupCoordinator
:
if
use_custom_allreduce
is
None
:
if
use_custom_allreduce
is
None
:
use_custom_allreduce
=
_ENABLE_CUSTOM_ALL_REDUCE
use_custom_allreduce
=
_ENABLE_CUSTOM_ALL_REDUCE
from
vllm.platforms
import
current_platform
return
GroupCoordinator
(
return
GroupCoordinator
(
group_ranks
=
group_ranks
,
group_ranks
=
group_ranks
,
local_rank
=
local_rank
,
local_rank
=
local_rank
,
torch_distributed_backend
=
backend
,
torch_distributed_backend
=
backend
,
use_pynccl
=
True
,
use_pynccl
=
current_platform
.
is_cuda_alike
(),
use_custom_allreduce
=
use_custom_allreduce
,
use_custom_allreduce
=
current_platform
.
is_cuda_alike
()
and
use_custom_allreduce
,
use_tpu_communicator
=
True
,
use_tpu_communicator
=
True
,
use_hpu_communicator
=
True
,
use_hpu_communicator
=
True
,
use_xpu_communicator
=
True
,
use_xpu_communicator
=
True
,
...
@@ -920,7 +911,7 @@ def get_kv_transfer_group() -> kv_transfer.KVTransferAgent:
...
@@ -920,7 +911,7 @@ def get_kv_transfer_group() -> kv_transfer.KVTransferAgent:
@
contextmanager
@
contextmanager
def
graph_capture
():
def
graph_capture
(
device
:
torch
.
device
):
"""
"""
`graph_capture` is a context manager which should surround the code that
`graph_capture` is a context manager which should surround the code that
is capturing the CUDA graph. Its main purpose is to ensure that the
is capturing the CUDA graph. Its main purpose is to ensure that the
...
@@ -934,8 +925,9 @@ def graph_capture():
...
@@ -934,8 +925,9 @@ def graph_capture():
in order to explicitly distinguish the kernels to capture
in order to explicitly distinguish the kernels to capture
from other kernels possibly launched on background in the default stream.
from other kernels possibly launched on background in the default stream.
"""
"""
with
get_tp_group
().
graph_capture
()
as
context
,
get_pp_group
(
context
=
GraphCaptureContext
(
torch
.
cuda
.
Stream
(
device
=
device
))
).
graph_capture
(
context
):
with
get_tp_group
().
graph_capture
(
context
),
get_pp_group
().
graph_capture
(
context
):
yield
context
yield
context
...
@@ -1022,8 +1014,8 @@ def initialize_model_parallel(
...
@@ -1022,8 +1014,8 @@ def initialize_model_parallel(
backend
=
backend
or
torch
.
distributed
.
get_backend
(
backend
=
backend
or
torch
.
distributed
.
get_backend
(
get_world_group
().
device_group
)
get_world_group
().
device_group
)
if
(
world_size
!=
if
(
world_size
tensor_model_parallel_size
*
pipeline_model_parallel_size
):
!=
tensor_model_parallel_size
*
pipeline_model_parallel_size
):
raise
RuntimeError
(
raise
RuntimeError
(
f
"world_size (
{
world_size
}
) is not equal to "
f
"world_size (
{
world_size
}
) is not equal to "
f
"tensor_model_parallel_size (
{
tensor_model_parallel_size
}
) x "
f
"tensor_model_parallel_size (
{
tensor_model_parallel_size
}
) x "
...
@@ -1077,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
...
@@ -1077,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
return
return
if
all
([
if
all
([
vllm_config
.
kv_transfer_config
.
need_kv_parallel_group
,
vllm_config
.
kv_transfer_config
.
need_kv_parallel_group
,
_KV_TRANSFER
_KV_TRANSFER
is
None
is
None
]):
]):
_KV_TRANSFER
=
kv_transfer
.
KVTransferAgent
(
_KV_TRANSFER
=
kv_transfer
.
KVTransferAgent
(
rank
=
get_world_group
().
rank
,
rank
=
get_world_group
().
rank
,
...
@@ -1188,8 +1180,14 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
...
@@ -1188,8 +1180,14 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
import
ray
# Lazy import Ray
import
ray
# Lazy import Ray
ray
.
shutdown
()
ray
.
shutdown
()
gc
.
collect
()
gc
.
collect
()
from
vllm.platforms
import
current_platform
if
not
current_platform
.
is_cpu
():
if
not
current_platform
.
is_cpu
():
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
empty_cache
()
try
:
torch
.
_C
.
_host_emptyCache
()
except
AttributeError
:
logger
.
warning
(
"torch._C._host_emptyCache() only available in Pytorch >=2.5"
)
def
in_the_same_node_as
(
pg
:
Union
[
ProcessGroup
,
StatelessProcessGroup
],
def
in_the_same_node_as
(
pg
:
Union
[
ProcessGroup
,
StatelessProcessGroup
],
...
...
vllm/engine/arg_utils.py
View file @
afd0da21
...
@@ -18,7 +18,6 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
...
@@ -18,7 +18,6 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.platforms
import
current_platform
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
FlexibleArgumentParser
,
StoreBoolean
from
vllm.utils
import
FlexibleArgumentParser
,
StoreBoolean
...
@@ -99,10 +98,8 @@ class EngineArgs:
...
@@ -99,10 +98,8 @@ class EngineArgs:
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
dtype
:
str
=
'auto'
dtype
:
str
=
'auto'
kv_cache_dtype
:
str
=
'auto'
kv_cache_dtype
:
str
=
'auto'
quantization_param_path
:
Optional
[
str
]
=
None
seed
:
int
=
0
seed
:
int
=
0
max_model_len
:
Optional
[
int
]
=
None
max_model_len
:
Optional
[
int
]
=
None
worker_use_ray
:
bool
=
False
# Note: Specifying a custom executor backend by passing a class
# Note: Specifying a custom executor backend by passing a class
# is intended for expert use only. The API may change without
# is intended for expert use only. The API may change without
# notice.
# notice.
...
@@ -201,6 +198,10 @@ class EngineArgs:
...
@@ -201,6 +198,10 @@ class EngineArgs:
kv_transfer_config
:
Optional
[
KVTransferConfig
]
=
None
kv_transfer_config
:
Optional
[
KVTransferConfig
]
=
None
generation_config
:
Optional
[
str
]
=
None
generation_config
:
Optional
[
str
]
=
None
override_generation_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
enable_sleep_mode
:
bool
=
False
calculate_kv_scales
:
Optional
[
bool
]
=
None
def
__post_init__
(
self
):
def
__post_init__
(
self
):
if
not
self
.
tokenizer
:
if
not
self
.
tokenizer
:
...
@@ -242,7 +243,7 @@ class EngineArgs:
...
@@ -242,7 +243,7 @@ class EngineArgs:
choices
=
get_args
(
TaskOption
),
choices
=
get_args
(
TaskOption
),
help
=
'The task to use the model for. Each vLLM instance only '
help
=
'The task to use the model for. Each vLLM instance only '
'supports one task, even if the same model can be used for '
'supports one task, even if the same model can be used for '
'multiple tasks. When the model only supports one task, "auto" '
'multiple tasks. When the model only supports one task,
``
"auto"
``
'
'can be used to select it; otherwise, you must specify explicitly '
'can be used to select it; otherwise, you must specify explicitly '
'which task to use.'
)
'which task to use.'
)
parser
.
add_argument
(
parser
.
add_argument
(
...
@@ -254,7 +255,7 @@ class EngineArgs:
...
@@ -254,7 +255,7 @@ class EngineArgs:
parser
.
add_argument
(
parser
.
add_argument
(
'--skip-tokenizer-init'
,
'--skip-tokenizer-init'
,
action
=
'store_true'
,
action
=
'store_true'
,
help
=
'Skip initialization of tokenizer and detokenizer'
)
help
=
'Skip initialization of tokenizer and detokenizer
.
'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--revision'
,
'--revision'
,
type
=
nullable_str
,
type
=
nullable_str
,
...
@@ -352,18 +353,7 @@ class EngineArgs:
...
@@ -352,18 +353,7 @@ class EngineArgs:
default
=
EngineArgs
.
kv_cache_dtype
,
default
=
EngineArgs
.
kv_cache_dtype
,
help
=
'Data type for kv cache storage. If "auto", will use model '
help
=
'Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (hcu) supports fp8 (=fp8_e4m3)'
)
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)'
)
parser
.
add_argument
(
'--quantization-param-path'
,
type
=
nullable_str
,
default
=
None
,
help
=
'Path to the JSON file containing the KV cache '
'scaling factors. This should generally be supplied, when '
'KV cache dtype is FP8. Otherwise, KV cache scaling factors '
'default to 1.0, which may cause accuracy issues. '
'FP8_E5M2 (without scaling) is only supported on cuda version '
'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
'supported for common inference criteria.'
)
parser
.
add_argument
(
'--max-model-len'
,
parser
.
add_argument
(
'--max-model-len'
,
type
=
int
,
type
=
int
,
default
=
EngineArgs
.
max_model_len
,
default
=
EngineArgs
.
max_model_len
,
...
@@ -392,7 +382,7 @@ class EngineArgs:
...
@@ -392,7 +382,7 @@ class EngineArgs:
# Parallel arguments
# Parallel arguments
parser
.
add_argument
(
parser
.
add_argument
(
'--distributed-executor-backend'
,
'--distributed-executor-backend'
,
choices
=
[
'ray'
,
'mp'
],
choices
=
[
'ray'
,
'mp'
,
'uni'
,
'external_launcher'
],
default
=
EngineArgs
.
distributed_executor_backend
,
default
=
EngineArgs
.
distributed_executor_backend
,
help
=
'Backend to use for distributed model '
help
=
'Backend to use for distributed model '
'workers, either "ray" or "mp" (multiprocessing). If the product '
'workers, either "ray" or "mp" (multiprocessing). If the product '
...
@@ -400,12 +390,8 @@ class EngineArgs:
...
@@ -400,12 +390,8 @@ class EngineArgs:
'or equal to the number of GPUs available, "mp" will be used to '
'or equal to the number of GPUs available, "mp" will be used to '
'keep processing on a single host. Otherwise, this will default '
'keep processing on a single host. Otherwise, this will default '
'to "ray" if Ray is installed and fail otherwise. Note that tpu '
'to "ray" if Ray is installed and fail otherwise. Note that tpu '
'
and hpu
only support Ray for distributed inference.'
)
'only support
s
Ray for distributed inference.'
)
parser
.
add_argument
(
'--worker-use-ray'
,
action
=
'store_true'
,
help
=
'Deprecated, use --distributed-executor-backend=ray.'
)
parser
.
add_argument
(
'--pipeline-parallel-size'
,
parser
.
add_argument
(
'--pipeline-parallel-size'
,
'-pp'
,
'-pp'
,
type
=
int
,
type
=
int
,
...
@@ -434,7 +420,7 @@ class EngineArgs:
...
@@ -434,7 +420,7 @@ class EngineArgs:
choices
=
[
8
,
16
,
32
,
64
,
128
],
choices
=
[
8
,
16
,
32
,
64
,
128
],
help
=
'Token block size for contiguous chunks of '
help
=
'Token block size for contiguous chunks of '
'tokens. This is ignored on neuron devices and '
'tokens. This is ignored on neuron devices and '
'set to max-model-len. On CUDA devices, '
'set to
``--
max-model-len
``
. On CUDA devices, '
'only block sizes up to 32 are supported. '
'only block sizes up to 32 are supported. '
'On HPU devices, block size defaults to 128.'
)
'On HPU devices, block size defaults to 128.'
)
...
@@ -443,12 +429,12 @@ class EngineArgs:
...
@@ -443,12 +429,12 @@ class EngineArgs:
action
=
argparse
.
BooleanOptionalAction
,
action
=
argparse
.
BooleanOptionalAction
,
default
=
EngineArgs
.
enable_prefix_caching
,
default
=
EngineArgs
.
enable_prefix_caching
,
help
=
"Enables automatic prefix caching. "
help
=
"Enables automatic prefix caching. "
"Use --no-enable-prefix-caching to disable explicitly."
,
"Use
``
--no-enable-prefix-caching
``
to disable explicitly."
,
)
)
parser
.
add_argument
(
'--disable-sliding-window'
,
parser
.
add_argument
(
'--disable-sliding-window'
,
action
=
'store_true'
,
action
=
'store_true'
,
help
=
'Disables sliding window, '
help
=
'Disables sliding window, '
'capping to sliding window size'
)
'capping to sliding window size
.
'
)
parser
.
add_argument
(
'--use-v2-block-manager'
,
parser
.
add_argument
(
'--use-v2-block-manager'
,
action
=
'store_true'
,
action
=
'store_true'
,
default
=
True
,
default
=
True
,
...
@@ -542,7 +528,7 @@ class EngineArgs:
...
@@ -542,7 +528,7 @@ class EngineArgs:
default
=
None
,
default
=
None
,
type
=
json
.
loads
,
type
=
json
.
loads
,
help
=
'RoPE scaling configuration in JSON format. '
help
=
'RoPE scaling configuration in JSON format. '
'For example, {"rope_type":"dynamic","factor":2.0}'
)
'For example,
``
{"rope_type":"dynamic","factor":2.0}
``
'
)
parser
.
add_argument
(
'--rope-theta'
,
parser
.
add_argument
(
'--rope-theta'
,
default
=
None
,
default
=
None
,
type
=
float
,
type
=
float
,
...
@@ -611,7 +597,7 @@ class EngineArgs:
...
@@ -611,7 +597,7 @@ class EngineArgs:
default
=
None
,
default
=
None
,
type
=
json
.
loads
,
type
=
json
.
loads
,
help
=
(
'Overrides for the multimodal input mapping/processing, '
help
=
(
'Overrides for the multimodal input mapping/processing, '
'e.g., image processor. For example: {"num_crops": 4}.'
))
'e.g., image processor. For example:
``
{"num_crops": 4}
``
.'
))
parser
.
add_argument
(
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
action
=
'store_true'
,
...
@@ -879,7 +865,7 @@ class EngineArgs:
...
@@ -879,7 +865,7 @@ class EngineArgs:
"of the provided names. The model name in the model "
"of the provided names. The model name in the model "
"field of a response will be the first name in this "
"field of a response will be the first name in this "
"list. If not specified, the model name will be the "
"list. If not specified, the model name will be the "
"same as the `--model` argument. Noted that this name(s) "
"same as the
`
`--model`
`
argument. Noted that this name(s) "
"will also be used in `model_name` tag content of "
"will also be used in `model_name` tag content of "
"prometheus metrics, if multiple names provided, metrics "
"prometheus metrics, if multiple names provided, metrics "
"tag will take the first one."
)
"tag will take the first one."
)
...
@@ -899,7 +885,7 @@ class EngineArgs:
...
@@ -899,7 +885,7 @@ class EngineArgs:
default
=
None
,
default
=
None
,
help
=
"Valid choices are "
+
help
=
"Valid choices are "
+
","
.
join
(
ALLOWED_DETAILED_TRACE_MODULES
)
+
","
.
join
(
ALLOWED_DETAILED_TRACE_MODULES
)
+
". It makes sense to set this only if --otlp-traces-endpoint is"
". It makes sense to set this only if
``
--otlp-traces-endpoint
``
is"
" set. If set, it will collect detailed traces for the specified "
" set. If set, it will collect detailed traces for the specified "
"modules. This involves use of possibly costly and or blocking "
"modules. This involves use of possibly costly and or blocking "
"operations and hence might have a performance impact."
)
"operations and hence might have a performance impact."
)
...
@@ -926,13 +912,13 @@ class EngineArgs:
...
@@ -926,13 +912,13 @@ class EngineArgs:
type
=
json
.
loads
,
type
=
json
.
loads
,
default
=
None
,
default
=
None
,
help
=
"Override or set neuron device configuration. "
help
=
"Override or set neuron device configuration. "
"e.g. {
\"
cast_logits_dtype
\"
:
\"
bloat16
\"
}.
'
"
)
"e.g.
``
{
\"
cast_logits_dtype
\"
:
\"
bloat16
\"
}
``
."
)
parser
.
add_argument
(
parser
.
add_argument
(
'--override-pooler-config'
,
'--override-pooler-config'
,
type
=
PoolerConfig
.
from_json
,
type
=
PoolerConfig
.
from_json
,
default
=
None
,
default
=
None
,
help
=
"Override or set the pooling method for pooling models. "
help
=
"Override or set the pooling method for pooling models. "
"e.g. {
\"
pooling_type
\"
:
\"
mean
\"
,
\"
normalize
\"
: false}.
'
"
)
"e.g.
``
{
\"
pooling_type
\"
:
\"
mean
\"
,
\"
normalize
\"
: false}
``
."
)
parser
.
add_argument
(
'--compilation-config'
,
parser
.
add_argument
(
'--compilation-config'
,
'-O'
,
'-O'
,
...
@@ -962,16 +948,43 @@ class EngineArgs:
...
@@ -962,16 +948,43 @@ class EngineArgs:
type
=
str
,
type
=
str
,
default
=
"auto"
,
default
=
"auto"
,
help
=
'The worker class to use for distributed execution.'
)
help
=
'The worker class to use for distributed execution.'
)
parser
.
add_argument
(
parser
.
add_argument
(
"--generation-config"
,
"--generation-config"
,
type
=
nullable_str
,
type
=
nullable_str
,
default
=
None
,
default
=
None
,
help
=
"The folder path to the generation config. "
help
=
"The folder path to the generation config. "
"Defaults to None, will use the default generation config in vLLM. "
"Defaults to None, no generation config is loaded, vLLM defaults "
"If set to 'auto', the generation config will be automatically "
"will be used. If set to 'auto', the generation config will be "
"loaded from model. If set to a folder path, the generation config "
"loaded from model path. If set to a folder path, the generation "
"will be loaded from the specified folder path."
)
"config will be loaded from the specified folder path. If "
"`max_new_tokens` is specified in generation config, then "
"it sets a server-wide limit on the number of output tokens "
"for all requests."
)
parser
.
add_argument
(
"--override-generation-config"
,
type
=
json
.
loads
,
default
=
None
,
help
=
"Overrides or sets generation config in JSON format. "
"e.g. ``{
\"
temperature
\"
: 0.5}``. If used with "
"--generation-config=auto, the override parameters will be merged "
"with the default config from the model. If generation-config is "
"None, only the override parameters are used."
)
parser
.
add_argument
(
"--enable-sleep-mode"
,
action
=
"store_true"
,
default
=
False
,
help
=
"Enable sleep mode for the engine. "
"(only cuda platform is supported)"
)
parser
.
add_argument
(
'--calculate-kv-scales'
,
action
=
'store_true'
,
help
=
'This enables dynamic calculation of '
'k_scale and v_scale when kv-cache-dtype is fp8. '
'If calculate-kv-scales is false, the scales will '
'be loaded from the model checkpoint if available. '
'Otherwise, the scales will default to 1.0.'
)
return
parser
return
parser
...
@@ -1002,7 +1015,6 @@ class EngineArgs:
...
@@ -1002,7 +1015,6 @@ class EngineArgs:
tokenizer_revision
=
self
.
tokenizer_revision
,
tokenizer_revision
=
self
.
tokenizer_revision
,
max_model_len
=
self
.
max_model_len
,
max_model_len
=
self
.
max_model_len
,
quantization
=
self
.
quantization
,
quantization
=
self
.
quantization
,
quantization_param_path
=
self
.
quantization_param_path
,
enforce_eager
=
self
.
enforce_eager
,
enforce_eager
=
self
.
enforce_eager
,
max_seq_len_to_capture
=
self
.
max_seq_len_to_capture
,
max_seq_len_to_capture
=
self
.
max_seq_len_to_capture
,
max_logprobs
=
self
.
max_logprobs
,
max_logprobs
=
self
.
max_logprobs
,
...
@@ -1017,7 +1029,10 @@ class EngineArgs:
...
@@ -1017,7 +1029,10 @@ class EngineArgs:
override_neuron_config
=
self
.
override_neuron_config
,
override_neuron_config
=
self
.
override_neuron_config
,
override_pooler_config
=
self
.
override_pooler_config
,
override_pooler_config
=
self
.
override_pooler_config
,
logits_processor_pattern
=
self
.
logits_processor_pattern
,
logits_processor_pattern
=
self
.
logits_processor_pattern
,
generation_config
=
self
.
generation_config
)
generation_config
=
self
.
generation_config
,
override_generation_config
=
self
.
override_generation_config
,
enable_sleep_mode
=
self
.
enable_sleep_mode
,
)
def
create_load_config
(
self
)
->
LoadConfig
:
def
create_load_config
(
self
)
->
LoadConfig
:
return
LoadConfig
(
return
LoadConfig
(
...
@@ -1077,11 +1092,11 @@ class EngineArgs:
...
@@ -1077,11 +1092,11 @@ class EngineArgs:
sliding_window
=
model_config
.
get_sliding_window
(),
sliding_window
=
model_config
.
get_sliding_window
(),
enable_prefix_caching
=
self
.
enable_prefix_caching
,
enable_prefix_caching
=
self
.
enable_prefix_caching
,
cpu_offload_gb
=
self
.
cpu_offload_gb
,
cpu_offload_gb
=
self
.
cpu_offload_gb
,
calculate_kv_scales
=
self
.
calculate_kv_scales
,
)
)
parallel_config
=
ParallelConfig
(
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
self
.
pipeline_parallel_size
,
pipeline_parallel_size
=
self
.
pipeline_parallel_size
,
tensor_parallel_size
=
self
.
tensor_parallel_size
,
tensor_parallel_size
=
self
.
tensor_parallel_size
,
worker_use_ray
=
self
.
worker_use_ray
,
max_parallel_loading_workers
=
self
.
max_parallel_loading_workers
,
max_parallel_loading_workers
=
self
.
max_parallel_loading_workers
,
disable_custom_all_reduce
=
self
.
disable_custom_all_reduce
,
disable_custom_all_reduce
=
self
.
disable_custom_all_reduce
,
tokenizer_pool_config
=
TokenizerPoolConfig
.
create_config
(
tokenizer_pool_config
=
TokenizerPoolConfig
.
create_config
(
...
@@ -1111,6 +1126,7 @@ class EngineArgs:
...
@@ -1111,6 +1126,7 @@ class EngineArgs:
use_sliding_window
=
(
model_config
.
get_sliding_window
()
use_sliding_window
=
(
model_config
.
get_sliding_window
()
is
not
None
)
is
not
None
)
use_spec_decode
=
self
.
speculative_model
is
not
None
use_spec_decode
=
self
.
speculative_model
is
not
None
from
vllm.platforms
import
current_platform
if
(
is_gpu
and
not
use_sliding_window
and
not
use_spec_decode
if
(
is_gpu
and
not
use_sliding_window
and
not
use_spec_decode
and
not
self
.
enable_lora
and
not
self
.
enable_lora
and
not
self
.
enable_prompt_adapter
and
not
self
.
enable_prompt_adapter
...
@@ -1166,7 +1182,7 @@ class EngineArgs:
...
@@ -1166,7 +1182,7 @@ class EngineArgs:
num_speculative_heads
=
self
.
num_speculative_heads
num_speculative_heads
=
self
.
num_speculative_heads
)
)
# Reminder: Please update docs/source/
usage
/compatibility_matrix.md
# Reminder: Please update docs/source/
features
/compatibility_matrix.md
# If the feature combo become valid
# If the feature combo become valid
if
self
.
num_scheduler_steps
>
1
:
if
self
.
num_scheduler_steps
>
1
:
if
speculative_config
is
not
None
:
if
speculative_config
is
not
None
:
...
@@ -1175,6 +1191,12 @@ class EngineArgs:
...
@@ -1175,6 +1191,12 @@ class EngineArgs:
if
self
.
enable_chunked_prefill
and
self
.
pipeline_parallel_size
>
1
:
if
self
.
enable_chunked_prefill
and
self
.
pipeline_parallel_size
>
1
:
raise
ValueError
(
"Multi-Step Chunked-Prefill is not supported "
raise
ValueError
(
"Multi-Step Chunked-Prefill is not supported "
"for pipeline-parallel-size > 1"
)
"for pipeline-parallel-size > 1"
)
from
vllm.platforms
import
current_platform
if
current_platform
.
is_cpu
():
logger
.
warning
(
"Multi-Step (--num-scheduler-steps > 1) is "
"currently not supported for CPUs and has been "
"disabled."
)
self
.
num_scheduler_steps
=
1
# make sure num_lookahead_slots is set the higher value depending on
# make sure num_lookahead_slots is set the higher value depending on
# if we are using speculative decoding or multi-step
# if we are using speculative decoding or multi-step
...
@@ -1285,11 +1307,22 @@ class EngineArgs:
...
@@ -1285,11 +1307,22 @@ class EngineArgs:
self
.
enable_chunked_prefill
=
True
self
.
enable_chunked_prefill
=
True
# When no user override, set the default values based on the usage
# When no user override, set the default values based on the usage
# context.
# context.
# TODO(woosuk): Tune the default values for different hardware.
# Use different default values for different hardware.
default_max_num_batched_tokens
=
{
from
vllm.platforms
import
current_platform
UsageContext
.
LLM_CLASS
:
8192
,
device_name
=
current_platform
.
get_device_name
().
lower
()
UsageContext
.
OPENAI_API_SERVER
:
2048
,
if
"h100"
in
device_name
or
"h200"
in
device_name
:
}
# For H100 and H200, we use larger default values.
default_max_num_batched_tokens
=
{
UsageContext
.
LLM_CLASS
:
16384
,
UsageContext
.
OPENAI_API_SERVER
:
8192
,
}
else
:
# TODO(woosuk): Tune the default values for other hardware.
default_max_num_batched_tokens
=
{
UsageContext
.
LLM_CLASS
:
8192
,
UsageContext
.
OPENAI_API_SERVER
:
2048
,
}
if
(
self
.
max_num_batched_tokens
is
None
if
(
self
.
max_num_batched_tokens
is
None
and
usage_context
in
default_max_num_batched_tokens
):
and
usage_context
in
default_max_num_batched_tokens
):
self
.
max_num_batched_tokens
=
default_max_num_batched_tokens
[
self
.
max_num_batched_tokens
=
default_max_num_batched_tokens
[
...
...
vllm/engine/async_llm_engine.py
View file @
afd0da21
...
@@ -18,9 +18,7 @@ from vllm.engine.async_timeout import asyncio_timeout
...
@@ -18,9 +18,7 @@ from vllm.engine.async_timeout import asyncio_timeout
from
vllm.engine.llm_engine
import
LLMEngine
,
SchedulerOutputState
from
vllm.engine.llm_engine
import
LLMEngine
,
SchedulerOutputState
from
vllm.engine.metrics_types
import
StatLoggerBase
from
vllm.engine.metrics_types
import
StatLoggerBase
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
from
vllm.executor.executor_base
import
ExecutorAsyncBase
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.executor.gpu_executor
import
GPUExecutorAsync
from
vllm.executor.ray_utils
import
initialize_ray_cluster
from
vllm.inputs
import
PromptType
from
vllm.inputs
import
PromptType
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -620,69 +618,9 @@ class AsyncLLMEngine(EngineClient):
...
@@ -620,69 +618,9 @@ class AsyncLLMEngine(EngineClient):
rt
.
new_requests_event
.
set
()
rt
.
new_requests_event
.
set
()
@
classmethod
@
classmethod
def
_get_executor_cls
(
def
_get_executor_cls
(
cls
,
cls
,
engine_config
:
VllmConfig
)
->
Type
[
ExecutorAsyncBase
]:
engine_config
:
VllmConfig
)
->
Type
[
ExecutorBase
]:
distributed_executor_backend
=
(
return
LLMEngine
.
_get_executor_cls
(
engine_config
)
engine_config
.
parallel_config
.
distributed_executor_backend
)
if
isinstance
(
distributed_executor_backend
,
type
):
if
not
issubclass
(
distributed_executor_backend
,
ExecutorAsyncBase
):
raise
TypeError
(
"distributed_executor_backend must be a subclass of "
f
"ExecutorAsyncBase. Got
{
distributed_executor_backend
}
."
)
executor_class
=
distributed_executor_backend
elif
engine_config
.
device_config
.
device_type
==
"neuron"
:
from
vllm.executor.neuron_executor
import
NeuronExecutorAsync
executor_class
=
NeuronExecutorAsync
elif
engine_config
.
device_config
.
device_type
==
"tpu"
:
if
distributed_executor_backend
==
"ray"
:
from
vllm.executor.ray_tpu_executor
import
RayTPUExecutorAsync
executor_class
=
RayTPUExecutorAsync
else
:
assert
distributed_executor_backend
is
None
from
vllm.executor.tpu_executor
import
TPUExecutorAsync
executor_class
=
TPUExecutorAsync
elif
engine_config
.
device_config
.
device_type
==
"cpu"
:
from
vllm.executor.cpu_executor
import
CPUExecutorAsync
executor_class
=
CPUExecutorAsync
elif
engine_config
.
device_config
.
device_type
==
"hpu"
:
if
distributed_executor_backend
==
"ray"
:
initialize_ray_cluster
(
engine_config
.
parallel_config
)
from
vllm.executor.ray_hpu_executor
import
RayHPUExecutorAsync
executor_class
=
RayHPUExecutorAsync
else
:
from
vllm.executor.hpu_executor
import
HPUExecutorAsync
executor_class
=
HPUExecutorAsync
elif
engine_config
.
device_config
.
device_type
==
"openvino"
:
assert
distributed_executor_backend
is
None
,
(
"Distributed execution is not supported with "
"the OpenVINO backend."
)
from
vllm.executor.openvino_executor
import
OpenVINOExecutorAsync
executor_class
=
OpenVINOExecutorAsync
elif
engine_config
.
device_config
.
device_type
==
"xpu"
:
if
distributed_executor_backend
is
None
:
from
vllm.executor.xpu_executor
import
XPUExecutorAsync
executor_class
=
XPUExecutorAsync
elif
distributed_executor_backend
==
"ray"
:
from
vllm.executor.ray_xpu_executor
import
RayXPUExecutorAsync
executor_class
=
RayXPUExecutorAsync
elif
distributed_executor_backend
==
"mp"
:
from
vllm.executor.multiproc_xpu_executor
import
(
MultiprocessingXPUExecutorAsync
)
executor_class
=
MultiprocessingXPUExecutorAsync
else
:
raise
RuntimeError
(
"Not supported distributed execution model on XPU device."
)
elif
distributed_executor_backend
==
"ray"
:
from
vllm.executor.ray_gpu_executor
import
RayGPUExecutorAsync
executor_class
=
RayGPUExecutorAsync
elif
distributed_executor_backend
==
"mp"
:
from
vllm.executor.multiproc_gpu_executor
import
(
MultiprocessingGPUExecutorAsync
)
executor_class
=
MultiprocessingGPUExecutorAsync
else
:
from
vllm.executor.gpu_executor
import
GPUExecutorAsync
executor_class
=
GPUExecutorAsync
return
executor_class
@
classmethod
@
classmethod
def
from_engine_args
(
def
from_engine_args
(
...
@@ -700,9 +638,6 @@ class AsyncLLMEngine(EngineClient):
...
@@ -700,9 +638,6 @@ class AsyncLLMEngine(EngineClient):
executor_class
=
cls
.
_get_executor_cls
(
engine_config
)
executor_class
=
cls
.
_get_executor_cls
(
engine_config
)
if
executor_class
.
uses_ray
:
initialize_ray_cluster
(
engine_config
.
parallel_config
)
# Create the async LLM engine.
# Create the async LLM engine.
engine
=
cls
(
engine
=
cls
(
vllm_config
=
engine_config
,
vllm_config
=
engine_config
,
...
@@ -1242,20 +1177,16 @@ class AsyncLLMEngine(EngineClient):
...
@@ -1242,20 +1177,16 @@ class AsyncLLMEngine(EngineClient):
self
.
engine
.
remove_logger
(
logger_name
=
logger_name
)
self
.
engine
.
remove_logger
(
logger_name
=
logger_name
)
async
def
start_profile
(
self
)
->
None
:
async
def
start_profile
(
self
)
->
None
:
# using type instead of isinstance to check to avoid capturing
self
.
engine
.
start_profile
()
# inherited classes
if
type
(
self
.
engine
.
model_executor
)
==
GPUExecutorAsync
:
# noqa: E721
self
.
engine
.
model_executor
.
start_profile
()
else
:
self
.
engine
.
model_executor
.
_run_workers
(
"start_profile"
)
async
def
stop_profile
(
self
)
->
None
:
async
def
stop_profile
(
self
)
->
None
:
# using type instead of isinstance to check to avoid capturing
self
.
engine
.
stop_profile
()
# inherited classes
if
type
(
self
.
engine
.
model_executor
)
==
GPUExecutorAsync
:
# noqa: E721
async
def
reset_prefix_cache
(
self
)
->
None
:
self
.
engine
.
model_executor
.
stop_profile
()
self
.
engine
.
reset_prefix_cache
()
else
:
self
.
engine
.
model_executor
.
_run_workers
(
"stop_profile"
)
async
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
None
:
self
.
engine
.
add_lora
(
lora_request
)
# TODO(v1): Remove this class proxy when V1 goes default.
# TODO(v1): Remove this class proxy when V1 goes default.
...
...
vllm/engine/llm_engine.py
View file @
afd0da21
...
@@ -29,8 +29,6 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group
...
@@ -29,8 +29,6 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group
from
vllm.entrypoints.openai.logits_processors
import
(
from
vllm.entrypoints.openai.logits_processors
import
(
get_logits_processors
as
get_openai_logits_processors
)
get_logits_processors
as
get_openai_logits_processors
)
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.executor.gpu_executor
import
GPUExecutor
from
vllm.executor.ray_utils
import
initialize_ray_cluster
from
vllm.inputs
import
(
INPUT_REGISTRY
,
InputRegistry
,
ProcessorInputs
,
from
vllm.inputs
import
(
INPUT_REGISTRY
,
InputRegistry
,
ProcessorInputs
,
PromptType
,
SingletonInputsAdapter
)
PromptType
,
SingletonInputsAdapter
)
from
vllm.inputs.parse
import
is_encoder_decoder_inputs
,
is_token_prompt
from
vllm.inputs.parse
import
is_encoder_decoder_inputs
,
is_token_prompt
...
@@ -233,7 +231,7 @@ class LLMEngine:
...
@@ -233,7 +231,7 @@ class LLMEngine:
)
)
logger
.
info
(
logger
.
info
(
"Initializing a
n
LLM engine (v%s) with config: %s, "
"Initializing a
V0
LLM engine (v%s) with config: %s, "
"use_cached_outputs=%s, "
,
"use_cached_outputs=%s, "
,
VLLM_VERSION
,
VLLM_VERSION
,
vllm_config
,
vllm_config
,
...
@@ -445,64 +443,31 @@ class LLMEngine:
...
@@ -445,64 +443,31 @@ class LLMEngine:
raise
TypeError
(
raise
TypeError
(
"distributed_executor_backend must be a subclass of "
"distributed_executor_backend must be a subclass of "
f
"ExecutorBase. Got
{
distributed_executor_backend
}
."
)
f
"ExecutorBase. Got
{
distributed_executor_backend
}
."
)
if
distributed_executor_backend
.
uses_ray
:
# type: ignore
initialize_ray_cluster
(
engine_config
.
parallel_config
)
executor_class
=
distributed_executor_backend
executor_class
=
distributed_executor_backend
elif
engine_config
.
device_config
.
device_type
==
"neuron"
:
elif
engine_config
.
parallel_config
.
world_size
>
1
:
from
vllm.executor.neuron_executor
import
NeuronExecutor
executor_class
=
NeuronExecutor
elif
engine_config
.
device_config
.
device_type
==
"tpu"
:
if
distributed_executor_backend
==
"ray"
:
if
distributed_executor_backend
==
"ray"
:
initialize_ray_cluster
(
engine_config
.
parallel_config
)
from
vllm.executor.ray_distributed_executor
import
(
from
vllm.executor.ray_tpu_executor
import
RayTPUExecutor
RayDistributedExecutor
)
executor_class
=
RayTPUExecutor
executor_class
=
RayDistributedExecutor
else
:
assert
distributed_executor_backend
is
None
from
vllm.executor.tpu_executor
import
TPUExecutor
executor_class
=
TPUExecutor
elif
engine_config
.
device_config
.
device_type
==
"cpu"
:
from
vllm.executor.cpu_executor
import
CPUExecutor
executor_class
=
CPUExecutor
elif
engine_config
.
device_config
.
device_type
==
"hpu"
:
if
distributed_executor_backend
==
"ray"
:
initialize_ray_cluster
(
engine_config
.
parallel_config
)
from
vllm.executor.ray_hpu_executor
import
RayHPUExecutor
executor_class
=
RayHPUExecutor
else
:
from
vllm.executor.hpu_executor
import
HPUExecutor
executor_class
=
HPUExecutor
elif
engine_config
.
device_config
.
device_type
==
"openvino"
:
from
vllm.executor.openvino_executor
import
OpenVINOExecutor
executor_class
=
OpenVINOExecutor
elif
engine_config
.
device_config
.
device_type
==
"xpu"
:
if
distributed_executor_backend
==
"ray"
:
initialize_ray_cluster
(
engine_config
.
parallel_config
)
from
vllm.executor.ray_xpu_executor
import
RayXPUExecutor
executor_class
=
RayXPUExecutor
elif
distributed_executor_backend
==
"mp"
:
elif
distributed_executor_backend
==
"mp"
:
# FIXME(kunshang):
from
vllm.executor.mp_distributed_executor
import
(
# spawn needs calling `if __name__ == '__main__':``
MultiprocessingDistributedExecutor
)
# fork is not supported for xpu start new process.
assert
not
envs
.
VLLM_USE_RAY_SPMD_WORKER
,
(
logger
.
error
(
"multiprocessing distributed executor backend does not "
"Both start methods (spawn and fork) have issue "
"support VLLM_USE_RAY_SPMD_WORKER=1"
)
"on XPU if you use mp backend, Please try ray instead."
)
executor_class
=
MultiprocessingDistributedExecutor
else
:
elif
distributed_executor_backend
==
"uni"
:
from
vllm.executor.xpu_executor
import
XPUExecutor
# JAX-style, single-process, multi-device executor.
executor_class
=
XPUExecutor
from
vllm.executor.uniproc_executor
import
UniProcExecutor
elif
distributed_executor_backend
==
"ray"
:
executor_class
=
UniProcExecutor
initialize_ray_cluster
(
engine_config
.
parallel_config
)
elif
distributed_executor_backend
==
"external_launcher"
:
from
vllm.executor.ray_gpu_executor
import
RayGPUExecutor
# executor with external launcher
executor_class
=
RayGPUExecutor
from
vllm.executor.uniproc_executor
import
(
# noqa
elif
distributed_executor_backend
==
"mp"
:
ExecutorWithExternalLauncher
)
from
vllm.executor.multiproc_gpu_executor
import
(
executor_class
=
ExecutorWithExternalLauncher
MultiprocessingGPUExecutor
)
assert
not
envs
.
VLLM_USE_RAY_SPMD_WORKER
,
(
"multiprocessing distributed executor backend does not "
"support VLLM_USE_RAY_SPMD_WORKER=1"
)
executor_class
=
MultiprocessingGPUExecutor
else
:
else
:
from
vllm.executor.
gpu
_executor
import
GPU
Executor
from
vllm.executor.
uniproc
_executor
import
UniProc
Executor
executor_class
=
GPU
Executor
executor_class
=
UniProc
Executor
return
executor_class
return
executor_class
@
classmethod
@
classmethod
...
@@ -727,7 +692,9 @@ class LLMEngine:
...
@@ -727,7 +692,9 @@ class LLMEngine:
:class:`~vllm.PoolingParams` for pooling.
:class:`~vllm.PoolingParams` for pooling.
arrival_time: The arrival time of the request. If None, we use
arrival_time: The arrival time of the request. If None, we use
the current monotonic time.
the current monotonic time.
lora_request: The LoRA request to add.
trace_headers: OpenTelemetry trace headers.
trace_headers: OpenTelemetry trace headers.
prompt_adapter_request: The prompt adapter request to add.
priority: The priority of the request.
priority: The priority of the request.
Only applicable with priority scheduling.
Only applicable with priority scheduling.
...
@@ -950,6 +917,14 @@ class LLMEngine:
...
@@ -950,6 +917,14 @@ class LLMEngine:
"""
"""
return
self
.
scheduler
[
virtual_engine
].
has_unfinished_seqs
()
return
self
.
scheduler
[
virtual_engine
].
has_unfinished_seqs
()
def
reset_prefix_cache
(
self
)
->
bool
:
"""Reset prefix cache for all devices."""
success
=
True
for
scheduler
in
self
.
scheduler
:
success
=
success
and
scheduler
.
reset_prefix_cache
()
return
success
@
staticmethod
@
staticmethod
def
_process_sequence_group_outputs
(
def
_process_sequence_group_outputs
(
seq_group
:
SequenceGroup
,
seq_group
:
SequenceGroup
,
...
@@ -1038,8 +1013,23 @@ class LLMEngine:
...
@@ -1038,8 +1013,23 @@ class LLMEngine:
self
.
speculative_config
self
.
speculative_config
# Organize outputs by [step][sequence group] instead of
# Organize outputs by [step][sequence group] instead of
# [sequence group][step].
# [sequence group][step].
outputs_by_sequence_group
=
create_output_by_sequence_group
(
if
self
.
scheduler_config
.
is_multi_step
:
outputs
,
num_seq_groups
=
len
(
seq_group_metadata_list
))
outputs_by_sequence_group
=
create_output_by_sequence_group
(
outputs
,
len
(
seq_group_metadata_list
))
elif
self
.
speculative_config
:
# Decodes are multi-steps while prefills are not, outputting at
# most 1 token. Separate them so that we can trigger chunk
# processing without having to pad or copy over prompts K times
# to match decodes structure (costly with prompt_logprobs).
num_prefills
=
sum
(
sg
.
is_prompt
for
sg
in
seq_group_metadata_list
)
prefills
,
decodes
=
outputs
[:
num_prefills
],
outputs
[
num_prefills
:]
outputs_by_sequence_group
=
create_output_by_sequence_group
(
decodes
,
num_seq_groups
=
len
(
seq_group_metadata_list
)
-
num_prefills
)
outputs_by_sequence_group
=
[
p
.
outputs
for
p
in
prefills
]
+
outputs_by_sequence_group
# We have outputs for multiple steps submitted in a single burst,
# We have outputs for multiple steps submitted in a single burst,
# so invalidate is_first_step_output.
# so invalidate is_first_step_output.
is_first_step_output
=
None
is_first_step_output
=
None
...
@@ -1141,6 +1131,8 @@ class LLMEngine:
...
@@ -1141,6 +1131,8 @@ class LLMEngine:
seq_group
=
scheduled_seq_group
.
seq_group
seq_group
=
scheduled_seq_group
.
seq_group
seq_group
.
maybe_set_first_token_time
(
now
)
seq_group
.
maybe_set_first_token_time
(
now
)
if
not
seq_group
.
is_prefill
():
seq_group
.
set_last_token_time
(
now
)
request_output
=
RequestOutputFactory
.
create
(
request_output
=
RequestOutputFactory
.
create
(
seq_group
,
seq_group
,
self
.
seq_id_to_seq_group
,
self
.
seq_id_to_seq_group
,
...
@@ -1183,6 +1175,8 @@ class LLMEngine:
...
@@ -1183,6 +1175,8 @@ class LLMEngine:
seq_group
=
scheduled_seq_group
.
seq_group
seq_group
=
scheduled_seq_group
.
seq_group
seq_group
.
maybe_set_first_token_time
(
now
)
seq_group
.
maybe_set_first_token_time
(
now
)
if
not
seq_group
.
is_prefill
():
seq_group
.
set_last_token_time
(
now
)
request_output
=
RequestOutputFactory
.
create
(
request_output
=
RequestOutputFactory
.
create
(
seq_group
,
seq_group
,
self
.
seq_id_to_seq_group
,
self
.
seq_id_to_seq_group
,
...
@@ -1703,7 +1697,7 @@ class LLMEngine:
...
@@ -1703,7 +1697,7 @@ class LLMEngine:
# If the seq_group just finished the prefill state
# If the seq_group just finished the prefill state
# get TTFT.
# get TTFT.
if
not
seq_group
.
is_prefill
():
if
not
seq_group
.
is_prefill
():
latency
=
seq_group
.
get_last_latency
(
now
)
latency
=
seq_group
.
get_last_
token_
latency
()
time_to_first_tokens_iter
.
append
(
latency
)
time_to_first_tokens_iter
.
append
(
latency
)
# One generation token per finished prefill.
# One generation token per finished prefill.
...
@@ -1711,7 +1705,7 @@ class LLMEngine:
...
@@ -1711,7 +1705,7 @@ class LLMEngine:
seq_group
.
num_seqs
())
seq_group
.
num_seqs
())
else
:
else
:
# TPOTs.
# TPOTs.
latency
=
seq_group
.
get_last_latency
(
now
)
latency
=
seq_group
.
get_last_
token_
latency
()
time_per_output_tokens_iter
.
append
(
latency
)
time_per_output_tokens_iter
.
append
(
latency
)
if
seq_group
.
state
.
current_step
==
0
:
if
seq_group
.
state
.
current_step
==
0
:
# For async_output_proc, the do_log_stats()
# For async_output_proc, the do_log_stats()
...
@@ -1858,27 +1852,27 @@ class LLMEngine:
...
@@ -1858,27 +1852,27 @@ class LLMEngine:
def
list_prompt_adapters
(
self
)
->
List
[
int
]:
def
list_prompt_adapters
(
self
)
->
List
[
int
]:
return
self
.
model_executor
.
list_prompt_adapters
()
return
self
.
model_executor
.
list_prompt_adapters
()
def
start_profile
(
self
)
->
None
:
self
.
model_executor
.
start_profile
()
def
stop_profile
(
self
)
->
None
:
self
.
model_executor
.
stop_profile
()
def
sleep
(
self
,
level
:
int
=
1
)
->
None
:
assert
self
.
vllm_config
.
model_config
.
enable_sleep_mode
,
(
"Sleep mode is not enabled in the model config"
)
self
.
model_executor
.
sleep
(
level
=
level
)
def
wake_up
(
self
)
->
None
:
assert
self
.
vllm_config
.
model_config
.
enable_sleep_mode
,
(
"Sleep mode is not enabled in the model config"
)
self
.
model_executor
.
wake_up
()
def
check_health
(
self
)
->
None
:
def
check_health
(
self
)
->
None
:
if
self
.
tokenizer
:
if
self
.
tokenizer
:
self
.
tokenizer
.
check_health
()
self
.
tokenizer
.
check_health
()
self
.
model_executor
.
check_health
()
self
.
model_executor
.
check_health
()
def
start_profile
(
self
)
->
None
:
# using type instead of isinstance to check to avoid capturing
# inherited classes (MultiprocessingGPUExecutor)
if
type
(
self
.
model_executor
)
==
GPUExecutor
:
# noqa: E721
self
.
model_executor
.
start_profile
()
else
:
self
.
model_executor
.
_run_workers
(
"start_profile"
)
def
stop_profile
(
self
)
->
None
:
# using type instead of isinstance to check to avoid capturing
# inherited classes (MultiprocessingGPUExecutor)
if
type
(
self
.
model_executor
)
==
GPUExecutor
:
# noqa: E721
self
.
model_executor
.
stop_profile
()
else
:
self
.
model_executor
.
_run_workers
(
"stop_profile"
)
def
is_tracing_enabled
(
self
)
->
bool
:
def
is_tracing_enabled
(
self
)
->
bool
:
return
self
.
tracer
is
not
None
return
self
.
tracer
is
not
None
...
@@ -1913,46 +1907,44 @@ class LLMEngine:
...
@@ -1913,46 +1907,44 @@ class LLMEngine:
metrics
=
seq_group
.
metrics
metrics
=
seq_group
.
metrics
ttft
=
metrics
.
first_token_time
-
metrics
.
arrival_time
ttft
=
metrics
.
first_token_time
-
metrics
.
arrival_time
e2e_time
=
metrics
.
finished_time
-
metrics
.
arrival_time
e2e_time
=
metrics
.
finished_time
-
metrics
.
arrival_time
# attribute names are based on
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI_RESPONSE_MODEL
,
# https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
seq_span
.
set_attribute
(
SpanAttributes
.
LLM_RESPONSE_MODEL
,
self
.
model_config
.
model
)
self
.
model_config
.
model
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_ID
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_ID
,
seq_group
.
request_id
)
seq_group
.
request_id
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_TEMPERATURE
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_TEMPERATURE
,
seq_group
.
sampling_params
.
temperature
)
seq_group
.
sampling_params
.
temperature
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_TOP_P
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_TOP_P
,
seq_group
.
sampling_params
.
top_p
)
seq_group
.
sampling_params
.
top_p
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_MAX_TOKENS
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_MAX_TOKENS
,
seq_group
.
sampling_params
.
max_tokens
)
seq_group
.
sampling_params
.
max_tokens
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_REQUEST_N
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_REQUEST_N
,
seq_group
.
sampling_params
.
n
)
seq_group
.
sampling_params
.
n
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_USAGE_NUM_SEQUENCES
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_USAGE_NUM_SEQUENCES
,
seq_group
.
num_seqs
())
seq_group
.
num_seqs
())
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_USAGE_PROMPT_TOKENS
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_USAGE_PROMPT_TOKENS
,
len
(
seq_group
.
prompt_token_ids
))
len
(
seq_group
.
prompt_token_ids
))
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_USAGE_COMPLETION_TOKENS
,
SpanAttributes
.
GEN_AI
_USAGE_COMPLETION_TOKENS
,
sum
([
sum
([
seq
.
get_output_len
()
seq
.
get_output_len
()
for
seq
in
seq_group
.
get_finished_seqs
()
for
seq
in
seq_group
.
get_finished_seqs
()
]))
]))
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_QUEUE
,
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_QUEUE
,
metrics
.
time_in_queue
)
metrics
.
time_in_queue
)
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_TO_FIRST_TOKEN
,
ttft
)
SpanAttributes
.
GEN_AI
_LATENCY_TIME_TO_FIRST_TOKEN
,
ttft
)
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_E2E
,
e2e_time
)
seq_span
.
set_attribute
(
SpanAttributes
.
GEN_AI
_LATENCY_E2E
,
e2e_time
)
if
metrics
.
scheduler_time
is
not
None
:
if
metrics
.
scheduler_time
is
not
None
:
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_SCHEDULER
,
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_SCHEDULER
,
metrics
.
scheduler_time
)
metrics
.
scheduler_time
)
if
metrics
.
model_forward_time
is
not
None
:
if
metrics
.
model_forward_time
is
not
None
:
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_MODEL_FORWARD
,
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_MODEL_FORWARD
,
metrics
.
model_forward_time
/
1000.0
)
metrics
.
model_forward_time
/
1000.0
)
if
metrics
.
model_execute_time
is
not
None
:
if
metrics
.
model_execute_time
is
not
None
:
seq_span
.
set_attribute
(
seq_span
.
set_attribute
(
SpanAttributes
.
LLM
_LATENCY_TIME_IN_MODEL_EXECUTE
,
SpanAttributes
.
GEN_AI
_LATENCY_TIME_IN_MODEL_EXECUTE
,
metrics
.
model_execute_time
)
metrics
.
model_execute_time
)
def
_validate_model_inputs
(
self
,
inputs
:
ProcessorInputs
,
def
_validate_model_inputs
(
self
,
inputs
:
ProcessorInputs
,
...
...
vllm/engine/metrics.py
View file @
afd0da21
...
@@ -120,7 +120,8 @@ class Metrics:
...
@@ -120,7 +120,8 @@ class Metrics:
labelnames
=
labelnames
)
labelnames
=
labelnames
)
buckets
=
[
1
,
8
,
16
,
32
,
64
,
128
,
256
,
512
,
1024
,
2048
,
4096
,
8096
]
buckets
=
[
1
,
8
,
16
,
32
,
64
,
128
,
256
,
512
,
1024
,
2048
,
4096
,
8096
]
if
not
vllm_config
.
model_config
.
enforce_eager
:
if
not
vllm_config
.
model_config
.
enforce_eager
:
buckets
=
vllm_config
.
compilation_config
.
capture_sizes
.
copy
()
buckets
=
vllm_config
.
compilation_config
.
\
cudagraph_capture_sizes
.
copy
()
buckets
.
sort
()
buckets
.
sort
()
self
.
histogram_iteration_tokens
=
self
.
_histogram_cls
(
self
.
histogram_iteration_tokens
=
self
.
_histogram_cls
(
name
=
"vllm:iteration_tokens_total"
,
name
=
"vllm:iteration_tokens_total"
,
...
@@ -258,21 +259,6 @@ class Metrics:
...
@@ -258,21 +259,6 @@ class Metrics:
documentation
=
"Number of emitted tokens."
,
documentation
=
"Number of emitted tokens."
,
labelnames
=
labelnames
))
labelnames
=
labelnames
))
# Deprecated in favor of vllm:prompt_tokens_total
self
.
gauge_avg_prompt_throughput
=
self
.
_gauge_cls
(
name
=
"vllm:avg_prompt_throughput_toks_per_s"
,
documentation
=
"Average prefill throughput in tokens/s."
,
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
,
)
# Deprecated in favor of vllm:generation_tokens_total
self
.
gauge_avg_generation_throughput
=
self
.
_gauge_cls
(
name
=
"vllm:avg_generation_throughput_toks_per_s"
,
documentation
=
"Average generation throughput in tokens/s."
,
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
,
)
# end-metrics-definitions
# end-metrics-definitions
...
@@ -634,20 +620,6 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -634,20 +620,6 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
_log_histogram
(
self
.
metrics
.
histogram_max_tokens_request
,
self
.
_log_histogram
(
self
.
metrics
.
histogram_max_tokens_request
,
stats
.
max_tokens_requests
)
stats
.
max_tokens_requests
)
def
_log_prometheus_interval
(
self
,
prompt_throughput
:
float
,
generation_throughput
:
float
)
->
None
:
# Logs metrics to prometheus that are computed every logging_interval.
# Support legacy gauge metrics that make throughput calculations on
# the vLLM side. Moving forward, we should use counters like
# counter_prompt_tokens, counter_generation_tokens
# Which log raw data and calculate summaries using rate() on the
# grafana/prometheus side. See
# https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
self
.
metrics
.
gauge_avg_prompt_throughput
.
labels
(
**
self
.
labels
).
set
(
prompt_throughput
)
self
.
metrics
.
gauge_avg_generation_throughput
.
labels
(
**
self
.
labels
).
set
(
generation_throughput
)
def
log
(
self
,
stats
:
Stats
):
def
log
(
self
,
stats
:
Stats
):
"""Logs to prometheus and tracked stats every iteration."""
"""Logs to prometheus and tracked stats every iteration."""
# Log to prometheus.
# Log to prometheus.
...
@@ -663,20 +635,6 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -663,20 +635,6 @@ class PrometheusStatLogger(StatLoggerBase):
# Log locally every local_interval seconds.
# Log locally every local_interval seconds.
if
local_interval_elapsed
(
stats
.
now
,
self
.
last_local_log
,
if
local_interval_elapsed
(
stats
.
now
,
self
.
last_local_log
,
self
.
local_interval
):
self
.
local_interval
):
# Compute summary metrics for tracked stats (and log them
# to promethus if applicable).
prompt_throughput
=
get_throughput
(
self
.
num_prompt_tokens
,
now
=
stats
.
now
,
last_log
=
self
.
last_local_log
)
generation_throughput
=
get_throughput
(
self
.
num_generation_tokens
,
now
=
stats
.
now
,
last_log
=
self
.
last_local_log
)
self
.
_log_prometheus_interval
(
prompt_throughput
=
prompt_throughput
,
generation_throughput
=
generation_throughput
)
if
self
.
spec_decode_metrics
is
not
None
:
if
self
.
spec_decode_metrics
is
not
None
:
self
.
_log_gauge
(
self
.
_log_gauge
(
self
.
metrics
.
gauge_spec_decode_draft_acceptance_rate
,
self
.
metrics
.
gauge_spec_decode_draft_acceptance_rate
,
...
...
vllm/engine/multiprocessing/__init__.py
View file @
afd0da21
from
dataclasses
import
dataclass
import
uuid
from
dataclasses
import
dataclass
,
field
from
enum
import
Enum
from
enum
import
Enum
from
typing
import
List
,
Mapping
,
Optional
,
Union
,
overload
from
typing
import
List
,
Mapping
,
Optional
,
Union
,
overload
...
@@ -120,10 +121,28 @@ class RPCUProfileRequest(Enum):
...
@@ -120,10 +121,28 @@ class RPCUProfileRequest(Enum):
STOP_PROFILE
=
2
STOP_PROFILE
=
2
class
RPCResetPrefixCacheRequest
(
Enum
):
RESET_PREFIX_CACHE
=
1
@
dataclass
class
RPCLoadAdapterRequest
:
lora_request
:
LoRARequest
# Set the default value of request_id to a new UUID
request_id
:
str
=
field
(
default_factory
=
lambda
:
str
(
uuid
.
uuid4
()))
@
dataclass
class
RPCAdapterLoadedResponse
:
request_id
:
str
RPC_REQUEST_T
=
Union
[
RPCProcessRequest
,
RPCAbortRequest
,
RPCStartupRequest
,
RPC_REQUEST_T
=
Union
[
RPCProcessRequest
,
RPCAbortRequest
,
RPCStartupRequest
,
RPCUProfileRequest
]
RPCUProfileRequest
,
RPCLoadAdapterRequest
,
RPCResetPrefixCacheRequest
]
REQUEST_OUTPUTS_T
=
Union
[
List
[
RequestOutput
],
RPCError
]
REQUEST_OUTPUTS_T
=
Union
[
List
[
RequestOutput
],
RPCAdapterLoadedResponse
,
RPCError
]
def
ENGINE_DEAD_ERROR
(
def
ENGINE_DEAD_ERROR
(
...
...
Prev
1
…
25
26
27
28
29
30
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment