Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
817 additions
and
387 deletions
+817
-387
vllm/v1/executor/abstract.py
vllm/v1/executor/abstract.py
+1
-1
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/multiproc_executor.py
+9
-7
vllm/v1/executor/ray_executor.py
vllm/v1/executor/ray_executor.py
+3
-3
vllm/v1/executor/uniproc_executor.py
vllm/v1/executor/uniproc_executor.py
+8
-5
vllm/v1/kv_offload/cpu.py
vllm/v1/kv_offload/cpu.py
+7
-7
vllm/v1/kv_offload/worker/cpu_gpu.py
vllm/v1/kv_offload/worker/cpu_gpu.py
+175
-86
vllm/v1/outputs.py
vllm/v1/outputs.py
+4
-0
vllm/v1/request.py
vllm/v1/request.py
+5
-3
vllm/v1/sample/rejection_sampler.py
vllm/v1/sample/rejection_sampler.py
+1
-1
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+6
-1
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+2
-2
vllm/v1/structured_output/backend_xgrammar.py
vllm/v1/structured_output/backend_xgrammar.py
+24
-8
vllm/v1/worker/cp_utils.py
vllm/v1/worker/cp_utils.py
+42
-0
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+258
-246
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+8
-2
vllm/v1/worker/kv_connector_model_runner_mixin.py
vllm/v1/worker/kv_connector_model_runner_mixin.py
+3
-10
vllm/v1/worker/tpu_worker.py
vllm/v1/worker/tpu_worker.py
+3
-2
vllm/v1/worker/utils.py
vllm/v1/worker/utils.py
+13
-3
vllm/v1/worker/workspace.py
vllm/v1/worker/workspace.py
+245
-0
No files found.
vllm/v1/executor/abstract.py
View file @
a3f8d5dd
...
...
@@ -219,7 +219,7 @@ class Executor(ABC):
def
sample_tokens
(
self
,
grammar_output
:
GrammarOutput
|
None
,
non_block
:
bool
=
False
)
->
ModelRunnerOutput
|
None
|
Future
[
ModelRunnerOutput
|
None
]:
)
->
ModelRunnerOutput
|
Future
[
ModelRunnerOutput
]:
output
=
self
.
collective_rpc
(
# type: ignore[call-overload]
"sample_tokens"
,
args
=
(
grammar_output
,),
non_block
=
non_block
)
...
...
vllm/v1/executor/multiproc_executor.py
View file @
a3f8d5dd
...
...
@@ -124,9 +124,7 @@ class MultiprocExecutor(Executor):
# Set multiprocessing envs
set_multiprocessing_worker_envs
()
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
# get_loopback_ip() for communication.
# use the loopback address get_loopback_ip() for communication.
distributed_init_method
=
get_distributed_init_method
(
get_loopback_ip
(),
get_open_port
()
)
...
...
@@ -294,8 +292,8 @@ class MultiprocExecutor(Executor):
kwargs
:
dict
|
None
=
None
,
non_block
:
bool
=
False
,
unique_reply_rank
:
int
|
None
=
None
,
kv_output_aggregator
:
KVOutputAggregator
=
None
,
)
->
Any
|
list
[
Any
]
|
Future
[
Any
|
list
[
Any
]]
:
kv_output_aggregator
:
KVOutputAggregator
|
None
=
None
,
)
->
Any
:
"""Returns single result if unique_reply_rank and/or kv_output_aggregator
is provided, otherwise list."""
assert
self
.
rpc_broadcast_mq
is
not
None
,
(
...
...
@@ -476,6 +474,8 @@ class WorkerProc:
"""Wrapper that runs one Worker in a separate process."""
READY_STR
=
"READY"
rpc_broadcast_mq
:
MessageQueue
|
None
worker_response_mq
:
MessageQueue
|
None
def
_init_message_queues
(
self
,
input_shm_handle
:
Handle
,
vllm_config
:
VllmConfig
...
...
@@ -487,7 +487,7 @@ class WorkerProc:
)
# Initializes a message queue for sending the model output
self
.
worker_response_mq
:
MessageQueue
=
MessageQueue
(
1
,
1
)
self
.
worker_response_mq
=
MessageQueue
(
1
,
1
)
self
.
peer_response_handles
=
[]
else
:
# Initialize remote MessageQueue for receiving SchedulerOutput across nodes
...
...
@@ -706,7 +706,7 @@ class WorkerProc:
death_pipe
.
recv
()
except
EOFError
:
# Parent process has exited, terminate this worker
logger
.
info
(
"Parent process exited, terminating worker"
)
logger
.
info
_once
(
"Parent process exited, terminating worker"
)
# Send signal to self to trigger clean shutdown
shutdown_event
.
set
()
except
Exception
as
e
:
...
...
@@ -720,6 +720,7 @@ class WorkerProc:
try
:
reader
.
close
()
worker
=
WorkerProc
(
*
args
,
**
kwargs
)
assert
worker
.
worker_response_mq
is
not
None
# Send READY once we know everything is loaded
ready_writer
.
send
(
...
...
@@ -804,6 +805,7 @@ class WorkerProc:
def
worker_busy_loop
(
self
,
cancel
:
threading
.
Event
|
None
=
None
):
"""Main busy loop for Multiprocessing Workers"""
assert
self
.
rpc_broadcast_mq
is
not
None
while
True
:
method
,
args
,
kwargs
,
output_rank
=
self
.
rpc_broadcast_mq
.
dequeue
(
cancel
=
cancel
,
indefinite
=
True
...
...
vllm/v1/executor/ray_executor.py
View file @
a3f8d5dd
...
...
@@ -413,7 +413,7 @@ class RayDistributedExecutor(Executor):
self
,
grammar_output
:
"GrammarOutput | None"
,
non_block
:
bool
=
False
,
)
->
ModelRunnerOutput
|
Future
[
ModelRunnerOutput
]:
)
->
ModelRunnerOutput
|
None
|
Future
[
ModelRunnerOutput
|
None
]:
"""Execute the model on the Ray workers.
The scheduler output to use should have been provided in
...
...
@@ -428,7 +428,7 @@ class RayDistributedExecutor(Executor):
"""
scheduler_output
=
self
.
scheduler_output
if
scheduler_output
is
None
:
return
COMPLETED_NONE_FUTURE
if
non_block
else
None
# noqa
return
COMPLETED_NONE_FUTURE
if
non_block
else
None
self
.
scheduler_output
=
None
...
...
@@ -439,7 +439,7 @@ class RayDistributedExecutor(Executor):
scheduler_output
:
SchedulerOutput
,
grammar_output
:
"GrammarOutput | None"
,
non_block
:
bool
=
False
,
)
->
ModelRunnerOutput
|
Future
[
ModelRunnerOutput
]:
)
->
ModelRunnerOutput
|
None
|
Future
[
ModelRunnerOutput
|
None
]:
# Build the compiled DAG for the first time.
if
self
.
forward_dag
is
None
:
# type: ignore
self
.
forward_dag
=
self
.
_compiled_ray_dag
(
enable_asyncio
=
False
)
...
...
vllm/v1/executor/uniproc_executor.py
View file @
a3f8d5dd
...
...
@@ -67,7 +67,7 @@ class UniProcExecutor(Executor):
kwargs
:
dict
|
None
=
None
,
non_block
:
bool
=
False
,
single_value
:
bool
=
False
,
)
->
Any
|
list
[
Any
]
|
Future
[
Any
|
list
[
Any
]]
:
)
->
Any
:
if
kwargs
is
None
:
kwargs
=
{}
...
...
@@ -79,10 +79,13 @@ class UniProcExecutor(Executor):
result
=
run_method
(
self
.
driver_worker
,
method
,
args
,
kwargs
)
if
isinstance
(
result
,
AsyncModelRunnerOutput
):
if
(
async_thread
:
=
self
.
async_output_thread
)
is
not
None
:
get_output
=
result
.
get_output
if
not
single_value
:
get_output
=
lambda
go
=
result
.
get_output
:
[
go
()]
return
async_thread
.
submit
(
get_output
)
if
single_value
:
return
async_thread
.
submit
(
result
.
get_output
)
def
get_output_list
()
->
list
[
Any
]:
return
[
result
.
get_output
()]
return
async_thread
.
submit
(
get_output_list
)
result
=
result
.
get_output
()
future
=
Future
[
Any
]()
future
.
set_result
(
result
if
single_value
else
[
result
])
...
...
vllm/v1/kv_offload/cpu.py
View file @
a3f8d5dd
...
...
@@ -13,7 +13,7 @@ from vllm.v1.kv_offload.backends.cpu import CPUBackend
from
vllm.v1.kv_offload.lru_manager
import
LRUOffloadingManager
from
vllm.v1.kv_offload.mediums
import
CPULoadStoreSpec
,
GPULoadStoreSpec
from
vllm.v1.kv_offload.spec
import
OffloadingSpec
from
vllm.v1.kv_offload.worker.cpu_gpu
import
CpuGpuOffloadingHandler
from
vllm.v1.kv_offload.worker.cpu_gpu
import
CpuGpuOffloadingHandler
s
from
vllm.v1.kv_offload.worker.worker
import
OffloadingHandler
...
...
@@ -32,7 +32,7 @@ class CPUOffloadingSpec(OffloadingSpec):
self
.
_manager
:
OffloadingManager
|
None
=
None
# worker-side
self
.
_handler
:
OffloadingHandler
|
None
=
None
self
.
_handler
s
:
CpuGpu
OffloadingHandler
s
|
None
=
None
self
.
eviction_policy
:
str
=
self
.
extra_config
.
get
(
"eviction_policy"
,
"lru"
)
...
...
@@ -67,13 +67,13 @@ class CPUOffloadingSpec(OffloadingSpec):
kv_caches
:
dict
[
str
,
torch
.
Tensor
],
attn_backends
:
dict
[
str
,
type
[
AttentionBackend
]],
)
->
Iterator
[
tuple
[
type
[
LoadStoreSpec
],
type
[
LoadStoreSpec
],
OffloadingHandler
]]:
if
not
self
.
_handler
:
if
not
self
.
_handler
s
:
if
not
current_platform
.
is_cuda_alike
():
raise
Exception
(
"CPU Offloading is currently only supported on CUDA-alike GPUs"
)
self
.
_handler
=
CpuGpuOffloadingHandler
(
self
.
_handler
s
=
CpuGpuOffloadingHandler
s
(
attn_backends
=
attn_backends
,
gpu_block_size
=
self
.
gpu_block_size
,
cpu_block_size
=
self
.
offloaded_block_size
,
...
...
@@ -81,6 +81,6 @@ class CPUOffloadingSpec(OffloadingSpec):
gpu_caches
=
kv_caches
,
)
assert
self
.
_handler
is
not
None
yield
GPULoadStoreSpec
,
CPULoadStoreSpec
,
self
.
_handler
yield
CPULoadStoreSpec
,
GPULoadStoreSpec
,
self
.
_handler
assert
self
.
_handler
s
is
not
None
yield
GPULoadStoreSpec
,
CPULoadStoreSpec
,
self
.
_handler
s
.
gpu_to_cpu_handler
yield
CPULoadStoreSpec
,
GPULoadStoreSpec
,
self
.
_handler
s
.
cpu_to_gpu_handler
vllm/v1/kv_offload/worker/cpu_gpu.py
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections
import
deque
import
numpy
as
np
import
torch
...
...
@@ -8,7 +9,7 @@ from vllm import _custom_ops as ops
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.logger
import
init_logger
from
vllm.utils.platform_utils
import
is_pin_memory_available
from
vllm.v1.kv_offload.mediums
import
CPULoadStoreSpec
,
GPU
LoadStoreSpec
from
vllm.v1.kv_offload.mediums
import
BlockIDs
LoadStoreSpec
from
vllm.v1.kv_offload.worker.worker
import
(
OffloadingHandler
,
TransferResult
,
...
...
@@ -51,7 +52,123 @@ def expand_block_ids(
output_idx
=
output_end_idx
class
CpuGpuOffloadingHandler
(
OffloadingHandler
):
class
SingleDirectionOffloadingHandler
(
OffloadingHandler
):
"""
SingleDirectionOffloadingHandler handles transfers for a single direction,
either CPU->GPU or GPU->CPU.
Transfers are guaranteed to be executed in order of their submission.
Each transfer uses a unique CUDA stream, and its stream will start
executing only after the streams of previous transfers have finished.
"""
def
__init__
(
self
,
src_tensors
:
list
[
torch
.
Tensor
],
dst_tensors
:
list
[
torch
.
Tensor
],
kv_dim_before_num_blocks
:
list
[
bool
],
src_block_size_factor
:
int
,
dst_block_size_factor
:
int
,
priority
:
int
,
):
"""
Initialize a SingleDirectionOffloadingHandler.
Args:
src_tensors: list of KV cache tensors to copy from.
dst_tensors: list of KV cache tensors to copy to.
Order should match src_tensors.
kv_dim_before_num_blocks: list of bools, indicating
whether the respective KV cache tensor has a KV
dimension before its num_blocks dimension.
e.g. (2, num_blocks, ...)
src_block_size_factor: The number of kernel blocks
per KV block in a source tensor.
dst_block_size_factor: The number of kernel blocks
per KV block in a destination tensor.
priority: The priority of the backing CUDA streams.
Lower numbers indicate higher priority.
"""
assert
len
(
src_tensors
)
==
len
(
dst_tensors
)
==
len
(
kv_dim_before_num_blocks
)
self
.
src_tensors
:
list
[
torch
.
Tensor
]
=
src_tensors
self
.
dst_tensors
:
list
[
torch
.
Tensor
]
=
dst_tensors
self
.
kv_dim_before_num_blocks
:
list
[
bool
]
=
kv_dim_before_num_blocks
self
.
src_block_size_factor
:
int
=
src_block_size_factor
self
.
dst_block_size_factor
:
int
=
dst_block_size_factor
self
.
priority
=
priority
# queue of transfers (job_id, stream, event)
self
.
_transfers
:
deque
[
tuple
[
int
,
torch
.
cuda
.
Stream
,
torch
.
Event
]]
=
deque
()
# list of CUDA streams available for re-use
self
.
_stream_pool
:
list
[
torch
.
cuda
.
Stream
]
=
[]
# list of CUDA events available for re-use
self
.
_event_pool
:
list
[
torch
.
Event
]
=
[]
def
transfer_async
(
self
,
job_id
:
int
,
transfer_spec
:
TransferSpec
)
->
bool
:
src_spec
,
dst_spec
=
transfer_spec
assert
isinstance
(
src_spec
,
BlockIDsLoadStoreSpec
)
assert
isinstance
(
dst_spec
,
BlockIDsLoadStoreSpec
)
src_blocks
=
src_spec
.
block_ids
dst_blocks
=
dst_spec
.
block_ids
assert
src_blocks
.
ndim
==
1
assert
dst_blocks
.
ndim
==
1
src_sub_block_count
=
src_blocks
.
size
*
self
.
src_block_size_factor
dst_sub_block_count
=
dst_blocks
.
size
*
self
.
dst_block_size_factor
src_sub_blocks_to_skip
=
-
dst_blocks
.
size
%
self
.
src_block_size_factor
assert
dst_sub_block_count
==
src_sub_block_count
-
src_sub_blocks_to_skip
src_to_dst
=
np
.
empty
((
dst_sub_block_count
,
2
),
dtype
=
np
.
int64
)
expand_block_ids
(
src_blocks
,
self
.
src_block_size_factor
,
src_to_dst
[:,
0
],
skip_count
=
src_sub_blocks_to_skip
,
)
expand_block_ids
(
dst_blocks
,
self
.
dst_block_size_factor
,
src_to_dst
[:,
1
])
src_to_dst_tensor
=
torch
.
from_numpy
(
src_to_dst
)
stream
=
(
self
.
_stream_pool
.
pop
()
if
self
.
_stream_pool
else
torch
.
cuda
.
Stream
(
priority
=
self
.
priority
)
)
event
=
self
.
_event_pool
.
pop
()
if
self
.
_event_pool
else
torch
.
Event
()
if
self
.
_transfers
:
_
,
_
,
last_event
=
self
.
_transfers
[
-
1
]
# assure job will start only after the previous one completes
stream
.
wait_event
(
last_event
)
with
torch
.
cuda
.
stream
(
stream
):
for
src_tensor
,
dst_tensor
,
kv_dim
in
zip
(
self
.
src_tensors
,
self
.
dst_tensors
,
self
.
kv_dim_before_num_blocks
):
if
kv_dim
:
src_key_cache
,
src_value_cache
=
src_tensor
dst_key_cache
,
dst_value_cache
=
dst_tensor
ops
.
swap_blocks
(
src_key_cache
,
dst_key_cache
,
src_to_dst_tensor
)
ops
.
swap_blocks
(
src_value_cache
,
dst_value_cache
,
src_to_dst_tensor
)
else
:
ops
.
swap_blocks
(
src_tensor
,
dst_tensor
,
src_to_dst_tensor
)
event
.
record
(
stream
)
self
.
_transfers
.
append
((
job_id
,
stream
,
event
))
# success
return
True
def
get_finished
(
self
)
->
list
[
TransferResult
]:
results
:
list
[
TransferResult
]
=
[]
while
self
.
_transfers
and
self
.
_transfers
[
0
][
2
].
query
():
job_id
,
stream
,
event
=
self
.
_transfers
.
popleft
()
results
.
append
((
job_id
,
True
))
self
.
_stream_pool
.
append
(
stream
)
self
.
_event_pool
.
append
(
event
)
return
results
class
CpuGpuOffloadingHandlers
:
def
__init__
(
self
,
gpu_block_size
:
int
,
...
...
@@ -60,27 +177,20 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
gpu_caches
:
dict
[
str
,
torch
.
Tensor
],
attn_backends
:
dict
[
str
,
type
[
AttentionBackend
]],
):
assert
gpu_caches
assert
cpu_block_size
%
gpu_block_size
==
0
self
.
block_size_factor
=
cpu_block_size
//
gpu_block_size
# cuda streams for gpu->cpu and cpu->gpu
self
.
d2h_stream
=
torch
.
cuda
.
Stream
()
self
.
h2d_stream
=
torch
.
cuda
.
Stream
()
# job_id -> transfer cuda event
self
.
transfer_events
:
dict
[
int
,
torch
.
Event
]
=
{}
# list of cuda events available for re-use
self
.
events_pool
:
list
[
torch
.
Event
]
=
[]
block_size_factor
=
cpu_block_size
//
gpu_block_size
pin_memory
=
is_pin_memory_available
()
# allocate cpu tensors
logger
.
info
(
"Allocating %d CPU tensors..."
,
len
(
gpu_caches
))
self
.
gpu_tensors
:
list
[
torch
.
Tensor
]
=
[]
self
.
cpu_tensors
:
list
[
torch
.
Tensor
]
=
[]
self
.
kv_dim_before_num_blocks
:
list
[
bool
]
=
[]
gpu_tensors
:
list
[
torch
.
Tensor
]
=
[]
cpu_tensors
:
list
[
torch
.
Tensor
]
=
[]
kv_dim_before_num_blocks
:
list
[
bool
]
=
[]
kernel_block_size
:
int
|
None
=
None
for
layer_name
,
gpu_tensor
in
gpu_caches
.
items
():
self
.
gpu_tensors
.
append
(
gpu_tensor
)
gpu_tensors
.
append
(
gpu_tensor
)
gpu_shape
=
gpu_tensor
.
shape
attn_backend
=
attn_backends
[
layer_name
]
...
...
@@ -88,16 +198,21 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
num_blocks
=
1234
,
block_size
=
16
,
num_kv_heads
=
8
,
head_size
=
256
)
has_layers_dim
=
False
if
len
(
gpu_shape
)
!=
len
(
test_shape
):
# cross-layers tensor
# shape is (num_blocks, ...)
assert
len
(
gpu_shape
)
==
len
(
test_shape
)
+
1
num_blocks_idx
=
0
self
.
kv_dim_before_num_blocks
.
append
(
False
)
has_layers_dim
=
True
kv_dim_before_num_blocks
.
append
(
False
)
# prepend a dummy num_layers=80 to test_shape
test_shape
=
(
80
,)
+
test_shape
elif
test_shape
[
0
]
==
1234
:
# shape is (num_blocks, ...)
num_blocks_idx
=
0
self
.
kv_dim_before_num_blocks
.
append
(
False
)
kv_dim_before_num_blocks
.
append
(
False
)
else
:
# shape should be (2, num_blocks, ...)
assert
test_shape
[
0
]
==
2
...
...
@@ -105,13 +220,32 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
assert
gpu_shape
[
0
]
==
2
num_blocks_idx
=
1
self
.
kv_dim_before_num_blocks
.
append
(
True
)
kv_dim_before_num_blocks
.
append
(
True
)
try
:
kv_cache_stride_order
=
attn_backend
.
get_kv_cache_stride_order
(
include_num_layers_dimension
=
has_layers_dim
)
assert
len
(
kv_cache_stride_order
)
==
len
(
gpu_shape
)
except
(
AttributeError
,
NotImplementedError
):
kv_cache_stride_order
=
tuple
(
range
(
len
(
gpu_shape
)))
# permute test_shape according to stride_order
test_shape
=
tuple
(
test_shape
[
i
]
for
i
in
kv_cache_stride_order
)
# find block_size (16) dimension index
block_size_idx
=
test_shape
.
index
(
16
)
if
kernel_block_size
is
not
None
:
assert
kernel_block_size
==
gpu_shape
[
block_size_idx
]
else
:
kernel_block_size
=
gpu_shape
[
block_size_idx
]
assert
gpu_block_size
%
kernel_block_size
==
0
cpu_shape
=
list
(
gpu_shape
)
cpu_shape
[
num_blocks_idx
]
=
num_cpu_blocks
*
self
.
block_size_factor
cpu_shape
[
num_blocks_idx
]
=
num_cpu_blocks
*
block_size_factor
logger
.
debug
(
"Allocating CPU tensor of shape %r"
,
cpu_shape
)
self
.
cpu_tensors
.
append
(
cpu_tensors
.
append
(
torch
.
zeros
(
cpu_shape
,
dtype
=
gpu_tensor
.
dtype
,
...
...
@@ -120,72 +254,27 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
)
)
def
transfer_async
(
self
,
job_id
:
int
,
spec
:
TransferSpec
)
->
bool
:
src_spec
,
dst_spec
=
spec
if
isinstance
(
src_spec
,
CPULoadStoreSpec
):
assert
isinstance
(
dst_spec
,
GPULoadStoreSpec
)
stream
=
self
.
h2d_stream
src_tensors
=
self
.
cpu_tensors
dst_tensors
=
self
.
gpu_tensors
src_block_size_factor
=
self
.
block_size_factor
dst_block_size_factor
=
1
else
:
assert
isinstance
(
src_spec
,
GPULoadStoreSpec
)
assert
isinstance
(
dst_spec
,
CPULoadStoreSpec
)
stream
=
self
.
d2h_stream
src_tensors
=
self
.
gpu_tensors
dst_tensors
=
self
.
cpu_tensors
src_block_size_factor
=
1
dst_block_size_factor
=
self
.
block_size_factor
src_blocks
=
src_spec
.
block_ids
dst_blocks
=
dst_spec
.
block_ids
assert
src_blocks
.
ndim
==
1
assert
dst_blocks
.
ndim
==
1
assert
kernel_block_size
is
not
None
gpu_block_size_factor
=
gpu_block_size
//
kernel_block_size
cpu_block_size_factor
=
cpu_block_size
//
kernel_block_size
src_sub_block_count
=
src_blocks
.
size
*
src_block_size_factor
dst_sub_block_count
=
dst_blocks
.
size
*
dst_block_size_factor
src_sub_blocks_to_skip
=
-
dst_blocks
.
size
%
src_block_size_factor
# TODO (orozery): adapt swap_blocks to support gpu_block_size_factor
assert
gpu_block_size_factor
==
1
assert
dst_sub_block_count
==
src_sub_block_count
-
src_sub_blocks_to_skip
src_to_dst
=
np
.
empty
((
dst_sub_block_count
,
2
),
dtype
=
np
.
int64
)
expand_block_ids
(
src_blocks
,
src_block_size_factor
,
src_to_dst
[:,
0
],
skip_count
=
src_sub_blocks_to_skip
,
self
.
gpu_to_cpu_handler
=
SingleDirectionOffloadingHandler
(
src_tensors
=
gpu_tensors
,
dst_tensors
=
cpu_tensors
,
kv_dim_before_num_blocks
=
kv_dim_before_num_blocks
,
src_block_size_factor
=
gpu_block_size_factor
,
dst_block_size_factor
=
cpu_block_size_factor
,
priority
=
1
,
)
expand_block_ids
(
dst_blocks
,
dst_block_size_factor
,
src_to_dst
[:,
1
])
src_to_dst_tensor
=
torch
.
from_numpy
(
src_to_dst
)
event
=
self
.
events_pool
.
pop
()
if
self
.
events_pool
else
torch
.
Event
()
with
torch
.
cuda
.
stream
(
stream
):
for
src_tensor
,
dst_tensor
,
kv_dim
in
zip
(
src_tensors
,
dst_tensors
,
self
.
kv_dim_before_num_blocks
):
if
kv_dim
:
src_key_cache
=
src_tensor
[
0
]
dst_key_cache
=
dst_tensor
[
0
]
ops
.
swap_blocks
(
src_key_cache
,
dst_key_cache
,
src_to_dst_tensor
)
src_value_cache
=
src_tensor
[
1
]
dst_value_cache
=
dst_tensor
[
1
]
ops
.
swap_blocks
(
src_value_cache
,
dst_value_cache
,
src_to_dst_tensor
)
else
:
ops
.
swap_blocks
(
src_tensor
,
dst_tensor
,
src_to_dst_tensor
)
event
.
record
(
stream
)
self
.
transfer_events
[
job_id
]
=
event
# success
return
True
def
get_finished
(
self
)
->
list
[
TransferResult
]:
results
:
list
[
TransferResult
]
=
[]
for
job_id
,
event
in
self
.
transfer_events
.
items
():
if
event
.
query
():
results
.
append
((
job_id
,
True
))
self
.
events_pool
.
append
(
event
)
for
job_id
,
_
in
results
:
del
self
.
transfer_events
[
job_id
]
return
results
self
.
cpu_to_gpu_handler
=
SingleDirectionOffloadingHandler
(
src_tensors
=
cpu_tensors
,
dst_tensors
=
gpu_tensors
,
kv_dim_before_num_blocks
=
kv_dim_before_num_blocks
,
src_block_size_factor
=
cpu_block_size_factor
,
dst_block_size_factor
=
gpu_block_size_factor
,
priority
=-
1
,
)
vllm/v1/outputs.py
View file @
a3f8d5dd
...
...
@@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat
from
vllm.v1.core.sched.output
import
SchedulerOutput
if
TYPE_CHECKING
:
from
vllm.distributed.kv_events
import
KVConnectorKVEvents
from
vllm.distributed.kv_transfer.kv_connector.v1.metrics
import
KVConnectorStats
else
:
KVConnectorStats
=
object
KVConnectorKVEvents
=
object
class
LogprobsLists
(
NamedTuple
):
...
...
@@ -108,6 +110,7 @@ class KVConnectorOutput:
finished_sending
:
set
[
str
]
|
None
=
None
finished_recving
:
set
[
str
]
|
None
=
None
kv_connector_stats
:
KVConnectorStats
|
None
=
None
kv_cache_events
:
KVConnectorKVEvents
|
None
=
None
# IDs of externally computed KV blocks that failed to load.
# Requests referencing these blocks should be rescheduled to recompute them
invalid_block_ids
:
set
[
int
]
=
field
(
default_factory
=
set
)
...
...
@@ -123,6 +126,7 @@ class KVConnectorOutput:
not
self
.
finished_sending
and
not
self
.
finished_recving
and
not
self
.
kv_connector_stats
and
not
self
.
kv_cache_events
and
not
self
.
invalid_block_ids
)
...
...
vllm/v1/request.py
View file @
a3f8d5dd
...
...
@@ -209,10 +209,10 @@ class Request:
def
get_finished_reason
(
self
)
->
FinishReason
|
None
:
return
RequestStatus
.
get_finished_reason
(
self
.
status
)
def
get_num_encoder_
token
s
(
self
,
input_id
:
int
)
->
int
:
def
get_num_encoder_
embed
s
(
self
,
input_id
:
int
)
->
int
:
assert
input_id
<
len
(
self
.
mm_features
)
num_
token
s
=
self
.
mm_features
[
input_id
].
mm_position
.
length
return
num_
token
s
num_
embed
s
=
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
return
num_
embed
s
def
record_event
(
self
,
...
...
@@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum):
FINISHED_LENGTH_CAPPED
=
enum
.
auto
()
FINISHED_ABORTED
=
enum
.
auto
()
FINISHED_IGNORED
=
enum
.
auto
()
FINISHED_ERROR
=
enum
.
auto
()
def
__str__
(
self
):
return
self
.
name
...
...
@@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = {
RequestStatus
.
FINISHED_LENGTH_CAPPED
:
FinishReason
.
LENGTH
,
RequestStatus
.
FINISHED_ABORTED
:
FinishReason
.
ABORT
,
RequestStatus
.
FINISHED_IGNORED
:
FinishReason
.
LENGTH
,
RequestStatus
.
FINISHED_ERROR
:
FinishReason
.
ERROR
,
}
vllm/v1/sample/rejection_sampler.py
View file @
a3f8d5dd
...
...
@@ -145,7 +145,7 @@ class RejectionSampler(nn.Module):
)
logprobs_tensors
=
None
if
sampling_metadata
.
max_num_logprobs
:
if
sampling_metadata
.
max_num_logprobs
is
not
None
:
logprobs_tensors
=
self
.
_get_logprobs_tensors
(
sampling_metadata
.
max_num_logprobs
,
metadata
,
...
...
vllm/v1/spec_decode/eagle.py
View file @
a3f8d5dd
...
...
@@ -170,7 +170,6 @@ class EagleProposer:
self
.
allowed_attn_types
:
tuple
|
None
=
None
if
current_platform
.
is_rocm
():
rocm_types
=
[
TritonAttentionMetadata
,
FlashAttentionMetadata
]
# ROCM_AITER_FA is an optional backend
# if find_spec(
# AttentionBackendEnum.ROCM_AITER_FA.get_path(include_classname=False)
...
...
@@ -180,6 +179,12 @@ class EagleProposer:
# )
# rocm_types.append(AiterFlashAttentionMetadata)
# TRITON_MLA backend support for MLA models (e.g., DeepSeek)
from
vllm.v1.attention.backends.mla.common
import
MLACommonMetadata
rocm_types
.
append
(
MLACommonMetadata
)
self
.
allowed_attn_types
=
tuple
(
rocm_types
)
# Parse the speculative token tree.
...
...
vllm/v1/structured_output/__init__.py
View file @
a3f8d5dd
...
...
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.reasoning
import
ReasoningParserManager
from
vllm.tokenizers
import
init
_tokenizer_from_config
from
vllm.tokenizers
import
cached
_tokenizer_from_config
from
vllm.utils.import_utils
import
LazyLoader
from
vllm.v1.structured_output.backend_guidance
import
GuidanceBackend
from
vllm.v1.structured_output.backend_types
import
(
...
...
@@ -71,7 +71,7 @@ class StructuredOutputManager:
# of CPUs.
max_workers
=
max
(
1
,
(
multiprocessing
.
cpu_count
()
+
1
)
//
2
)
self
.
executor
=
ThreadPoolExecutor
(
max_workers
=
max_workers
)
self
.
tokenizer
=
init
_tokenizer_from_config
(
self
.
tokenizer
=
cached
_tokenizer_from_config
(
model_config
=
self
.
vllm_config
.
model_config
)
reasoning_parser
=
(
...
...
vllm/v1/structured_output/backend_xgrammar.py
View file @
a3f8d5dd
...
...
@@ -10,7 +10,8 @@ import torch
import
vllm.envs
from
vllm.logger
import
init_logger
from
vllm.sampling_params
import
SamplingParams
from
vllm.tokenizers
import
MistralTokenizer
from
vllm.tokenizers.deepseek_v32
import
DeepseekV32Tokenizer
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.utils.import_utils
import
LazyLoader
from
vllm.v1.structured_output.backend_types
import
(
StructuredOutputBackend
,
...
...
@@ -56,6 +57,27 @@ class XgrammarBackend(StructuredOutputBackend):
stop_token_ids
=
stop_token_ids
,
add_prefix_space
=
True
,
)
elif
isinstance
(
self
.
tokenizer
,
DeepseekV32Tokenizer
):
# copy from xgr.TokenizerInfo.from_huggingface()
# because we are using a custom tokenizer wrapper here.
vocab_dict
=
self
.
tokenizer
.
get_vocab
()
tokenizer_vocab_size
=
max
(
len
(
vocab_dict
),
self
.
tokenizer
.
max_token_id
+
1
)
vocab_size
=
self
.
vocab_size
or
tokenizer_vocab_size
# maintain tokenizer's indexing
encoded_vocab
=
[
""
]
*
vocab_size
for
token
,
idx
in
vocab_dict
.
items
():
if
idx
<
vocab_size
:
encoded_vocab
[
idx
]
=
token
stop_token_ids
=
[
self
.
tokenizer
.
eos_token_id
]
backend_str
=
self
.
tokenizer
.
tokenizer
.
backend_tokenizer
.
to_str
()
metadata
=
xgr
.
TokenizerInfo
.
_detect_metadata_from_hf
(
backend_str
)
tokenizer_info
=
xgr
.
TokenizerInfo
(
encoded_vocab
=
encoded_vocab
,
vocab_type
=
metadata
[
"vocab_type"
],
vocab_size
=
vocab_size
,
stop_token_ids
=
stop_token_ids
,
add_prefix_space
=
metadata
[
"add_prefix_space"
],
)
else
:
tokenizer_info
=
xgr
.
TokenizerInfo
.
from_huggingface
(
self
.
tokenizer
,
...
...
@@ -246,13 +268,7 @@ def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
# Unsupported keywords for objects
if
obj
.
get
(
"type"
)
==
"object"
and
any
(
key
in
obj
for
key
in
(
"minProperties"
,
"maxProperties"
,
"propertyNames"
,
"patternProperties"
,
)
key
in
obj
for
key
in
(
"patternProperties"
,
"propertyNames"
)
):
return
True
...
...
vllm/v1/worker/cp_utils.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
TYPE_CHECKING
,
Any
,
cast
from
vllm.config
import
VllmConfig
,
get_layers_from_vllm_config
if
TYPE_CHECKING
:
from
vllm.model_executor.layers.attention_layer_base
import
AttentionLayerBase
else
:
AttentionLayerBase
=
object
def
check_attention_cp_compatibility
(
vllm_config
:
VllmConfig
)
->
None
:
pcp_size
=
vllm_config
.
parallel_config
.
prefill_context_parallel_size
dcp_size
=
vllm_config
.
parallel_config
.
decode_context_parallel_size
interleave_size
=
vllm_config
.
parallel_config
.
cp_kv_cache_interleave_size
if
pcp_size
*
dcp_size
>
1
:
layer_type
=
cast
(
type
[
Any
],
AttentionLayerBase
)
layers
=
get_layers_from_vllm_config
(
vllm_config
,
layer_type
)
for
layer
in
layers
.
values
():
layer_impl
=
getattr
(
layer
,
"impl"
,
None
)
if
layer_impl
is
None
:
continue
if
vllm_config
.
speculative_config
is
not
None
and
interleave_size
>
1
:
assert
layer_impl
.
supports_mtp_with_cp_non_trivial_interleave_size
,
(
"MTP with cp_kv_cache_interleave_size > 1 is not "
f
"supported in
{
layer_impl
.
__class__
.
__name__
}
."
)
if
dcp_size
>
1
:
assert
layer_impl
.
need_to_return_lse_for_decode
,
(
"DCP requires attention impls to return"
" the softmax lse for decode, but the impl "
f
"
{
layer_impl
.
__class__
.
__name__
}
"
"does not return the softmax lse for decode."
)
if
pcp_size
>
1
:
assert
layer_impl
.
supports_pcp
,
(
"PCP requires attention impls' support, "
f
"but the impl
{
layer_impl
.
__class__
.
__name__
}
"
"does not support PCP."
)
vllm/v1/worker/gpu_model_runner.py
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
functools
import
gc
import
itertools
import
time
...
...
@@ -148,6 +149,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from
vllm.v1.spec_decode.suffix_decoding
import
SuffixDecodingProposer
from
vllm.v1.structured_output.utils
import
apply_grammar_bitmask
from
vllm.v1.utils
import
CpuGpuBuffer
,
record_function_or_nullcontext
from
vllm.v1.worker.cp_utils
import
check_attention_cp_compatibility
from
vllm.v1.worker.dp_utils
import
coordinate_batch_across_dp
from
vllm.v1.worker.ec_connector_model_runner_mixin
import
ECConnectorModelRunnerMixin
from
vllm.v1.worker.gpu_input_batch
import
CachedRequestState
,
InputBatch
...
...
@@ -160,15 +162,14 @@ from vllm.v1.worker.ubatch_utils import (
maybe_create_ubatch_slices
,
)
from
vllm.v1.worker.utils
import
is_residual_scattered_for_sp
from
vllm.v1.worker.workspace
import
lock_workspace
from
.utils
import
(
AttentionGroup
,
MultiModalBudget
,
add_kv_sharing_layers_to_kv_cache_groups
,
bind_kv_cache
,
gather_mm_placeholders
,
sanity_check_mm_encoder_outputs
,
scatter_mm_placeholders
,
)
if
TYPE_CHECKING
:
...
...
@@ -295,6 +296,7 @@ class GPUModelRunner(
self
.
device
=
device
self
.
pin_memory
=
is_pin_memory_available
()
self
.
dtype
=
self
.
model_config
.
dtype
self
.
kv_cache_dtype
=
kv_cache_dtype_str_to_dtype
(
cache_config
.
cache_dtype
,
self
.
model_config
)
...
...
@@ -1267,6 +1269,8 @@ class GPUModelRunner(
if
not
isinstance
(
kv_cache_spec
,
CrossAttentionSpec
):
return
None
,
None
# Zero out buffer for padding requests that are not actually scheduled (CGs)
self
.
encoder_seq_lens
.
np
[:
num_reqs
]
=
0
# Build encoder_seq_lens array mapping request indices to
# encoder lengths for inputs scheduled in this batch
for
req_id
in
num_scheduled_tokens
:
...
...
@@ -1530,28 +1534,13 @@ class GPUModelRunner(
"""
:return: tuple[attn_metadata, spec_decode_common_attn_metadata]
"""
# Attention metadata is not needed for attention free models
if
len
(
self
.
kv_cache_config
.
kv_cache_groups
)
==
0
:
return
{},
None
num_tokens_padded
=
num_tokens_padded
or
num_tokens
num_reqs_padded
=
num_reqs_padded
or
num_reqs
logits_indices_padded
=
None
num_logits_indices
=
None
if
logits_indices
is
not
None
:
num_logits_indices
=
logits_indices
.
size
(
0
)
if
self
.
cache_config
.
kv_sharing_fast_prefill
:
logits_indices_padded
=
self
.
_prepare_kv_sharing_fast_prefill
(
logits_indices
)
# update seq_lens of decode reqs under DCP.
if
self
.
dcp_world_size
>
1
:
self
.
dcp_local_seq_lens
.
cpu
[:
num_reqs
]
=
get_dcp_local_seq_lens
(
self
.
seq_lens
.
cpu
[:
num_reqs
],
self
.
dcp_world_size
,
self
.
dcp_rank
,
self
.
parallel_config
.
cp_kv_cache_interleave_size
,
)
self
.
dcp_local_seq_lens
.
cpu
[
num_reqs
:].
fill_
(
0
)
self
.
dcp_local_seq_lens
.
copy_to_gpu
(
num_reqs_padded
)
assert
num_reqs_padded
is
not
None
and
num_tokens_padded
is
not
None
attn_metadata
:
PerLayerAttnMetadata
=
{}
if
ubatch_slices
is
not
None
:
...
...
@@ -1572,36 +1561,12 @@ class GPUModelRunner(
self
.
num_accepted_tokens
.
np
[
num_reqs
:].
fill
(
1
)
self
.
num_accepted_tokens
.
copy_to_gpu
()
# Used in the below loop, uses padded shapes
query_start_loc
=
self
.
query_start_loc
.
gpu
[:
num_reqs_padded
+
1
]
query_start_loc_cpu
=
self
.
query_start_loc
.
cpu
[:
num_reqs_padded
+
1
]
seq_lens
=
self
.
seq_lens
.
gpu
[:
num_reqs_padded
]
seq_lens_cpu
=
self
.
seq_lens
.
cpu
[:
num_reqs_padded
]
num_computed_tokens_cpu
=
self
.
input_batch
.
num_computed_tokens_cpu_tensor
[
:
num_reqs_padded
]
dcp_local_seq_lens
,
dcp_local_seq_lens_cpu
=
None
,
None
if
self
.
dcp_world_size
>
1
:
dcp_local_seq_lens
=
self
.
dcp_local_seq_lens
.
gpu
[:
num_reqs_padded
]
dcp_local_seq_lens_cpu
=
self
.
dcp_local_seq_lens
.
cpu
[:
num_reqs_padded
]
spec_decode_common_attn_metadata
=
None
kv_cache_groups
=
self
.
kv_cache_config
.
kv_cache_groups
# Prepare the attention metadata for each KV cache group and make layers
# in the same group share the same metadata.
for
kv_cache_gid
,
kv_cache_group
in
enumerate
(
self
.
kv_cache_config
.
kv_cache_groups
):
encoder_seq_lens
,
encoder_seq_lens_cpu
=
self
.
_get_encoder_seq_lens
(
num_scheduled_tokens
or
{},
kv_cache_group
.
kv_cache_spec
,
num_reqs_padded
,
)
if
isinstance
(
kv_cache_group
.
kv_cache_spec
,
EncoderOnlyAttentionSpec
):
# Encoder-only layers do not have KV cache, so we need to
# create a dummy block table and slot mapping for them.
def
_get_block_table_and_slot_mapping
(
kv_cache_gid
:
int
):
assert
num_reqs_padded
is
not
None
and
num_tokens_padded
is
not
None
kv_cache_spec
=
kv_cache_groups
[
kv_cache_gid
].
kv_cache_spec
if
isinstance
(
kv_cache_spec
,
EncoderOnlyAttentionSpec
):
blk_table_tensor
=
torch
.
zeros
(
(
num_reqs_padded
,
1
),
dtype
=
torch
.
int32
,
...
...
@@ -1617,92 +1582,129 @@ class GPUModelRunner(
blk_table_tensor
=
blk_table
.
get_device_tensor
(
num_reqs_padded
)
slot_mapping
=
blk_table
.
slot_mapping
.
gpu
[:
num_tokens_padded
]
# Fill unused with -1. Needed for reshape_and_cache in full cuda
# graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID
slot_mapping
[
num_tokens
:
num_tokens_padded
].
fill_
(
-
1
)
blk_table_tensor
[
num_reqs
:
num_reqs_padded
].
fill_
(
-
1
)
common_attn_metadata
=
CommonAttentionMetadata
(
query_start_loc
=
query_start_loc
,
query_start_loc_cpu
=
query_start_loc_cpu
,
seq_lens
=
seq_lens
,
_seq_lens_cpu
=
seq_lens_cpu
,
_num_computed_tokens_cpu
=
num_computed_tokens_cpu
,
num_actual_tokens
=
num_tokens_padded
,
num_reqs
=
num_reqs_padded
,
max_query_len
=
max_query_len
,
max_seq_len
=
max_seq_len
,
block_table_tensor
=
blk_table_tensor
,
slot_mapping
=
slot_mapping
,
logits_indices_padded
=
logits_indices_padded
,
num_logits_indices
=
num_logits_indices
,
causal
=
True
,
encoder_seq_lens
=
encoder_seq_lens
,
encoder_seq_lens_cpu
=
encoder_seq_lens_cpu
,
dcp_local_seq_lens
=
dcp_local_seq_lens
,
dcp_local_seq_lens_cpu
=
dcp_local_seq_lens_cpu
,
# Fill unused with -1. Needed for reshape_and_cache in full cuda
# graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID
slot_mapping
[
num_tokens
:
num_tokens_padded
].
fill_
(
-
1
)
blk_table_tensor
[
num_reqs
:
num_reqs_padded
].
fill_
(
-
1
)
return
blk_table_tensor
,
slot_mapping
block_table_gid_0
,
slot_mapping_gid_0
=
_get_block_table_and_slot_mapping
(
0
)
cm_base
=
CommonAttentionMetadata
(
query_start_loc
=
self
.
query_start_loc
.
gpu
[:
num_reqs_padded
+
1
],
query_start_loc_cpu
=
self
.
query_start_loc
.
cpu
[:
num_reqs_padded
+
1
],
seq_lens
=
self
.
seq_lens
.
gpu
[:
num_reqs_padded
],
_seq_lens_cpu
=
self
.
seq_lens
.
cpu
[:
num_reqs_padded
],
_num_computed_tokens_cpu
=
self
.
input_batch
.
num_computed_tokens_cpu_tensor
[
:
num_reqs_padded
],
num_reqs
=
num_reqs_padded
,
num_actual_tokens
=
num_tokens_padded
,
max_query_len
=
max_query_len
,
max_seq_len
=
max_seq_len
,
block_table_tensor
=
block_table_gid_0
,
slot_mapping
=
slot_mapping_gid_0
,
causal
=
True
,
)
if
self
.
dcp_world_size
>
1
:
self
.
dcp_local_seq_lens
.
cpu
[:
num_reqs
]
=
get_dcp_local_seq_lens
(
self
.
seq_lens
.
cpu
[:
num_reqs
],
self
.
dcp_world_size
,
self
.
dcp_rank
,
self
.
parallel_config
.
cp_kv_cache_interleave_size
,
)
self
.
dcp_local_seq_lens
.
cpu
[
num_reqs
:].
fill_
(
0
)
self
.
dcp_local_seq_lens
.
copy_to_gpu
(
num_reqs_padded
)
cm_base
.
dcp_local_seq_lens
=
self
.
dcp_local_seq_lens
.
gpu
[:
num_reqs_padded
]
cm_base
.
dcp_local_seq_lens_cpu
=
self
.
dcp_local_seq_lens
.
cpu
[
:
num_reqs_padded
]
if
logits_indices
is
not
None
and
self
.
cache_config
.
kv_sharing_fast_prefill
:
cm_base
.
num_logits_indices
=
logits_indices
.
size
(
0
)
cm_base
.
logits_indices_padded
=
self
.
_prepare_kv_sharing_fast_prefill
(
logits_indices
)
def
_build_attn_group_metadata
(
kv_cache_gid
:
int
,
attn_gid
:
int
,
common_attn_metadata
:
CommonAttentionMetadata
,
ubid
:
int
|
None
=
None
,
)
->
None
:
attn_group
=
self
.
attn_groups
[
kv_cache_gid
][
attn_gid
]
cascade_attn_prefix_len
=
(
cascade_attn_prefix_lens
[
kv_cache_gid
][
attn_gid
]
if
cascade_attn_prefix_lens
else
0
)
builder
=
attn_group
.
get_metadata_builder
(
ubid
or
0
)
extra_attn_metadata_args
=
{}
if
use_spec_decode
and
isinstance
(
builder
,
GDNAttentionMetadataBuilder
):
assert
ubid
is
None
,
"UBatching not supported with GDN yet"
extra_attn_metadata_args
=
dict
(
num_accepted_tokens
=
self
.
num_accepted_tokens
.
gpu
[:
num_reqs_padded
],
num_decode_draft_tokens_cpu
=
self
.
num_decode_draft_tokens
.
cpu
[
:
num_reqs_padded
],
)
if
for_cudagraph_capture
:
attn_metadata_i
=
builder
.
build_for_cudagraph_capture
(
common_attn_metadata
)
else
:
attn_metadata_i
=
builder
.
build
(
common_prefix_len
=
cascade_attn_prefix_len
,
common_attn_metadata
=
common_attn_metadata
,
**
extra_attn_metadata_args
,
)
if
ubid
is
None
:
assert
isinstance
(
attn_metadata
,
dict
)
attn_metadata_dict
=
attn_metadata
else
:
assert
isinstance
(
attn_metadata
,
list
)
attn_metadata_dict
=
attn_metadata
[
ubid
]
for
layer_name
in
attn_group
.
layer_names
:
attn_metadata_dict
[
layer_name
]
=
attn_metadata_i
# Prepare the attention metadata for each KV cache group and make layers
# in the same group share the same metadata.
spec_decode_common_attn_metadata
=
None
for
kv_cache_gid
,
kv_cache_group
in
enumerate
(
kv_cache_groups
):
cm
=
copy
(
cm_base
)
# shallow copy
# Basically only the encoder seq_lens, block_table and slot_mapping change
# for each kv_cache_group.
cm
.
encoder_seq_lens
,
cm
.
encoder_seq_lens_cpu
=
self
.
_get_encoder_seq_lens
(
num_scheduled_tokens
or
{},
kv_cache_group
.
kv_cache_spec
,
num_reqs_padded
,
)
if
kv_cache_gid
>
0
:
cm
.
block_table_tensor
,
cm
.
slot_mapping
=
(
_get_block_table_and_slot_mapping
(
kv_cache_gid
)
)
if
self
.
speculative_config
and
spec_decode_common_attn_metadata
is
None
:
if
isinstance
(
self
.
drafter
,
EagleProposer
):
if
self
.
drafter
.
attn_layer_names
[
0
]
in
kv_cache_group
.
layer_names
:
spec_decode_common_attn_metadata
=
c
ommon_attn_metadata
spec_decode_common_attn_metadata
=
c
m
else
:
spec_decode_common_attn_metadata
=
common_attn_metadata
for
attn_gid
,
attn_group
in
enumerate
(
self
.
attn_groups
[
kv_cache_gid
]):
cascade_attn_prefix_len
=
(
cascade_attn_prefix_lens
[
kv_cache_gid
][
attn_gid
]
if
cascade_attn_prefix_lens
else
0
)
builder
=
attn_group
.
get_metadata_builder
()
extra_attn_metadata_args
=
{}
if
use_spec_decode
and
isinstance
(
builder
,
GDNAttentionMetadataBuilder
):
extra_attn_metadata_args
=
dict
(
num_accepted_tokens
=
self
.
num_accepted_tokens
.
gpu
[
:
num_reqs_padded
],
num_decode_draft_tokens_cpu
=
self
.
num_decode_draft_tokens
.
cpu
[
:
num_reqs_padded
],
)
spec_decode_common_attn_metadata
=
cm
for
attn_gid
in
range
(
len
(
self
.
attn_groups
[
kv_cache_gid
])):
if
ubatch_slices
is
not
None
:
common_attn_metadata_list
=
split_attn_metadata
(
ubatch_slices
,
common_attn_metadata
)
for
ubid
,
common_attn_metadata
in
enumerate
(
common_attn_metadata_list
):
builder
=
attn_group
.
get_metadata_builder
(
ubatch_id
=
ubid
)
if
for_cudagraph_capture
:
attn_metadata_i
=
builder
.
build_for_cudagraph_capture
(
common_attn_metadata
)
else
:
attn_metadata_i
=
builder
.
build
(
common_prefix_len
=
cascade_attn_prefix_len
,
common_attn_metadata
=
common_attn_metadata
,
)
for
layer_name
in
kv_cache_group
.
layer_names
:
assert
type
(
attn_metadata
)
is
list
attn_metadata
[
ubid
][
layer_name
]
=
attn_metadata_i
for
ubid
,
_cm
in
enumerate
(
split_attn_metadata
(
ubatch_slices
,
cm
)):
_build_attn_group_metadata
(
kv_cache_gid
,
attn_gid
,
_cm
,
ubid
)
else
:
assert
isinstance
(
attn_metadata
,
dict
)
if
for_cudagraph_capture
:
attn_metadata_i
=
builder
.
build_for_cudagraph_capture
(
common_attn_metadata
)
else
:
attn_metadata_i
=
builder
.
build
(
common_prefix_len
=
cascade_attn_prefix_len
,
common_attn_metadata
=
common_attn_metadata
,
**
extra_attn_metadata_args
,
)
for
layer_name
in
attn_group
.
layer_names
:
attn_metadata
[
layer_name
]
=
attn_metadata_i
_build_attn_group_metadata
(
kv_cache_gid
,
attn_gid
,
cm
)
if
self
.
is_mm_prefix_lm
:
req_doc_ranges
=
{}
...
...
@@ -2183,10 +2185,7 @@ class GPUModelRunner(
# Cache the encoder outputs by mm_hash
for
(
mm_hash
,
pos_info
),
output
in
zip
(
mm_hashes_pos
,
encoder_outputs
):
self
.
encoder_cache
[
mm_hash
]
=
scatter_mm_placeholders
(
output
,
is_embed
=
pos_info
.
is_embed
,
)
self
.
encoder_cache
[
mm_hash
]
=
output
logger
.
debug
(
"Finish execute for mm hash %s"
,
mm_hash
)
self
.
maybe_save_ec_to_connector
(
self
.
encoder_cache
,
mm_hash
)
...
...
@@ -2237,6 +2236,13 @@ class GPUModelRunner(
num_encoder_tokens
,
)
assert
start_idx
<
end_idx
curr_embeds_start
,
curr_embeds_end
=
(
pos_info
.
get_embeds_indices_in_range
(
start_idx
,
end_idx
)
)
# If there are no embeddings in the current range, we skip
# gathering the embeddings.
if
curr_embeds_start
==
curr_embeds_end
:
continue
mm_hash
=
mm_feature
.
identifier
encoder_output
=
self
.
encoder_cache
.
get
(
mm_hash
,
None
)
...
...
@@ -2244,16 +2250,14 @@ class GPUModelRunner(
if
(
is_embed
:
=
pos_info
.
is_embed
)
is
not
None
:
is_embed
=
is_embed
[
start_idx
:
end_idx
]
mm_embeds_item
=
encoder_output
[
curr_embeds_start
:
curr_embeds_end
]
else
:
mm_embeds_item
=
encoder_output
[
start_idx
:
end_idx
]
req_start_pos
=
req_start_idx
+
start_pos
-
num_computed_tokens
is_mm_embed
[
req_start_pos
+
start_idx
:
req_start_pos
+
end_idx
]
=
(
True
if
is_embed
is
None
else
is_embed
)
mm_embeds_item
=
gather_mm_placeholders
(
encoder_output
[
start_idx
:
end_idx
],
is_embed
=
is_embed
,
)
mm_embeds_req
.
append
(
mm_embeds_item
)
if
self
.
is_multimodal_pruning_enabled
and
self
.
uses_mrope
:
...
...
@@ -2764,6 +2768,7 @@ class GPUModelRunner(
# be improved in model runner v2)
force_uniform_decode
:
bool
|
None
=
None
,
force_has_lora
:
bool
|
None
=
None
,
num_encoder_reqs
:
int
=
0
,
)
->
tuple
[
CUDAGraphMode
,
BatchDescriptor
,
...
...
@@ -2780,6 +2785,11 @@ class GPUModelRunner(
if
force_uniform_decode
is
None
else
force_uniform_decode
)
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
# is present). Also, chunked-prefill is disabled, so batch are uniform.
has_encoder_output
=
(
self
.
model_config
.
is_encoder_decoder
and
num_encoder_reqs
>
0
)
has_lora
=
(
len
(
self
.
input_batch
.
lora_id_to_lora_request
)
>
0
...
...
@@ -2799,7 +2809,7 @@ class GPUModelRunner(
)
cudagraph_mode
,
batch_descriptor
=
dispatch_cudagraph
(
num_tokens_padded
,
use_cascade_attn
num_tokens_padded
,
use_cascade_attn
or
has_encoder_output
)
num_tokens_padded
=
batch_descriptor
.
num_tokens
...
...
@@ -2997,6 +3007,7 @@ class GPUModelRunner(
num_scheduled_tokens_np
=
num_scheduled_tokens_np
,
max_num_scheduled_tokens
=
max_num_scheduled_tokens
,
use_cascade_attn
=
cascade_attn_prefix_lens
is
not
None
,
num_encoder_reqs
=
len
(
scheduler_output
.
scheduled_encoder_inputs
),
)
logger
.
debug
(
...
...
@@ -3562,74 +3573,89 @@ class GPUModelRunner(
if
self
.
parallel_config
.
enable_eplb
:
self
.
eplb_state
=
EplbState
(
self
.
parallel_config
,
self
.
device
)
eplb_models
=
0
with
DeviceMemoryProfiler
()
as
m
:
time_before_load
=
time
.
perf_counter
()
model_loader
=
get_model_loader
(
self
.
load_config
)
self
.
model
=
model_loader
.
load_model
(
vllm_config
=
self
.
vllm_config
,
model_config
=
self
.
model_config
)
if
self
.
lora_config
:
self
.
model
=
self
.
load_lora_model
(
self
.
model
,
self
.
vllm_config
,
self
.
device
try
:
with
DeviceMemoryProfiler
()
as
m
:
time_before_load
=
time
.
perf_counter
()
model_loader
=
get_model_loader
(
self
.
load_config
)
self
.
model
=
model_loader
.
load_model
(
vllm_config
=
self
.
vllm_config
,
model_config
=
self
.
model_config
)
if
hasattr
(
self
,
"drafter"
):
logger
.
info_once
(
"Loading drafter model..."
)
self
.
drafter
.
load_model
(
self
.
model
)
if
(
hasattr
(
self
.
drafter
,
"model"
)
and
is_mixture_of_experts
(
self
.
drafter
.
model
)
and
self
.
parallel_config
.
enable_eplb
):
spec_config
=
self
.
vllm_config
.
speculative_config
assert
spec_config
is
not
None
assert
spec_config
.
draft_model_config
is
not
None
logger
.
info_once
(
"EPLB is enabled for drafter model %s."
,
spec_config
.
draft_model_config
.
model
,
if
self
.
lora_config
:
self
.
model
=
self
.
load_lora_model
(
self
.
model
,
self
.
vllm_config
,
self
.
device
)
if
hasattr
(
self
,
"drafter"
):
logger
.
info_once
(
"Loading drafter model..."
)
self
.
drafter
.
load_model
(
self
.
model
)
if
(
hasattr
(
self
.
drafter
,
"model"
)
and
is_mixture_of_experts
(
self
.
drafter
.
model
)
and
self
.
parallel_config
.
enable_eplb
):
spec_config
=
self
.
vllm_config
.
speculative_config
assert
spec_config
is
not
None
assert
spec_config
.
draft_model_config
is
not
None
logger
.
info_once
(
"EPLB is enabled for drafter model %s."
,
spec_config
.
draft_model_config
.
model
,
)
global_expert_load
=
(
global_expert_loads
[
eplb_models
]
if
global_expert_loads
else
None
)
old_global_expert_indices
=
(
old_global_expert_indices_per_model
[
eplb_models
]
if
old_global_expert_indices_per_model
else
None
)
if
self
.
eplb_state
is
None
:
self
.
eplb_state
=
EplbState
(
self
.
parallel_config
,
self
.
device
)
self
.
eplb_state
.
add_model
(
self
.
drafter
.
model
,
spec_config
.
draft_model_config
,
global_expert_load
,
old_global_expert_indices
,
rank_mapping
,
)
eplb_models
+=
1
global_expert_load
=
(
global_expert_loads
[
eplb_models
]
if
global_expert_loads
else
None
)
old_global_expert_indices
=
(
old_global_expert_indices_per_model
[
eplb_models
]
if
old_global_expert_indices_per_model
else
None
)
if
self
.
eplb_state
is
None
:
self
.
eplb_state
=
EplbState
(
self
.
parallel_config
,
self
.
device
)
self
.
eplb_state
.
add_model
(
self
.
drafter
.
model
,
spec_config
.
draft_model_config
,
global_expert_load
,
old_global_expert_indices
,
rank_mapping
,
)
eplb_models
+=
1
if
self
.
use_aux_hidden_state_outputs
:
if
not
supports_eagle3
(
self
.
get_model
()):
raise
RuntimeError
(
"Model does not support EAGLE3 interface but "
"aux_hidden_state_outputs was requested"
)
if
self
.
use_aux_hidden_state_outputs
:
if
not
supports_eagle3
(
self
.
get_model
()):
raise
RuntimeError
(
"Model does not support EAGLE3 interface but "
"aux_hidden_state_outputs was requested"
)
# Try to get auxiliary layers from speculative config,
# otherwise use model's default layers
aux_layers
=
self
.
_get_eagle3_aux_layers_from_config
()
if
aux_layers
:
logger
.
info
(
"Using auxiliary layers from speculative config: %s"
,
aux_layers
,
)
else
:
aux_layers
=
self
.
model
.
get_eagle3_aux_hidden_state_layers
()
# Try to get auxiliary layers from speculative config,
# otherwise use model's default layers
aux_layers
=
self
.
_get_eagle3_aux_layers_from_config
()
if
aux_layers
:
logger
.
info
(
"Using auxiliary layers from speculative config: %s"
,
aux_layers
,
)
else
:
aux_layers
=
self
.
model
.
get_eagle3_aux_hidden_state_layers
()
self
.
model
.
set_aux_hidden_state_layers
(
aux_layers
)
time_after_load
=
time
.
perf_counter
()
self
.
model_memory_usage
=
m
.
consumed_memory
self
.
model
.
set_aux_hidden_state_layers
(
aux_layers
)
time_after_load
=
time
.
perf_counter
()
self
.
model_memory_usage
=
m
.
consumed_memory
except
torch
.
cuda
.
OutOfMemoryError
as
e
:
msg
=
(
"Failed to load model - not enough GPU memory. "
"Try lowering --gpu-memory-utilization to free memory for weights, "
"increasing --tensor-parallel-size, or using --quantization. "
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
"for more tips."
)
combined_msg
=
f
"
{
msg
}
(original error:
{
e
}
)"
logger
.
error
(
combined_msg
)
raise
e
logger
.
info_once
(
"Model loading took %.4f GiB memory and %.6f seconds"
,
self
.
model_memory_usage
/
GiB_bytes
,
...
...
@@ -3867,19 +3893,21 @@ class GPUModelRunner(
return
{}
@
contextmanager
def
maybe_randomize_inputs
(
self
,
input_ids
:
torch
.
Tensor
):
def
maybe_randomize_inputs
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
):
"""
Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
This is to help balance expert-selection
- during profile_run
- during DP rank dummy run
"""
dp_size
=
self
.
vllm_config
.
parallel_config
.
data_parallel_size
randomize_inputs
=
envs
.
VLLM_RANDOMIZE_DP_DUMMY_INPUTS
and
dp_size
>
1
if
not
randomize_inputs
:
yield
else
:
import
functools
elif
input_ids
is
not
None
:
@
functools
.
cache
def
rand_input_ids
()
->
torch
.
Tensor
:
...
...
@@ -3887,13 +3915,27 @@ class GPUModelRunner(
self
.
input_ids
.
gpu
,
low
=
0
,
high
=
self
.
model_config
.
get_vocab_size
(),
dtype
=
input_ids
.
dtype
,
)
logger
.
debug_once
(
"Randomizing dummy
data
for DP Rank"
)
logger
.
debug_once
(
"Randomizing dummy
input_ids
for DP Rank"
)
input_ids
.
copy_
(
rand_input_ids
()[:
input_ids
.
size
(
0
)],
non_blocking
=
True
)
yield
input_ids
.
fill_
(
0
)
else
:
@
functools
.
cache
def
rand_inputs_embeds
()
->
torch
.
Tensor
:
return
torch
.
randn_like
(
self
.
inputs_embeds
.
gpu
,
)
assert
inputs_embeds
is
not
None
logger
.
debug_once
(
"Randomizing dummy inputs_embeds for DP Rank"
)
inputs_embeds
.
copy_
(
rand_inputs_embeds
()[:
inputs_embeds
.
size
(
0
)],
non_blocking
=
True
)
yield
inputs_embeds
.
fill_
(
0
)
def
_get_mm_dummy_batch
(
self
,
...
...
@@ -4142,7 +4184,7 @@ class GPUModelRunner(
num_tokens_across_dp
[:]
=
num_tokens_padded
with
(
self
.
maybe_randomize_inputs
(
input_ids
),
self
.
maybe_randomize_inputs
(
input_ids
,
inputs_embeds
),
set_forward_context
(
attn_metadata
,
self
.
vllm_config
,
...
...
@@ -4425,31 +4467,8 @@ class GPUModelRunner(
dummy_encoder_outputs
,
expected_num_items
=
max_mm_items_per_batch
,
)
# NOTE: This happens when encoder cache needs to store
# the embeddings that encoder outputs are scattered onto.
# In this case we create dummy embeddings of size
# (max_tokens_for_modality, hidden_size) and scatter
# encoder output into it.
encoder_output_shape
=
dummy_encoder_outputs
[
0
].
shape
max_mm_tokens_per_item
=
mm_budget
.
max_tokens_by_modality
[
dummy_modality
]
if
encoder_output_shape
[
0
]
<
max_mm_tokens_per_item
:
encoder_hidden_size
=
encoder_output_shape
[
-
1
]
expanded_outputs
=
[]
for
output
in
dummy_encoder_outputs
:
expanded
=
output
.
new_zeros
(
(
max_mm_tokens_per_item
,
encoder_hidden_size
)
)
num_tokens
=
output
.
shape
[
0
]
expanded
[:
num_tokens
].
copy_
(
output
)
expanded_outputs
.
append
(
expanded
)
dummy_encoder_outputs
=
expanded_outputs
# Cache the dummy encoder outputs.
self
.
encoder_cache
[
"tmp"
]
=
dict
(
enumerate
(
dummy_encoder_outputs
))
for
i
,
output
in
enumerate
(
dummy_encoder_outputs
):
self
.
encoder_cache
[
f
"tmp_
{
i
}
"
]
=
output
# Add `is_profile` here to pre-allocate communication buffers
hidden_states
,
last_hidden_states
=
self
.
_dummy_run
(
...
...
@@ -4557,6 +4576,10 @@ class GPUModelRunner(
# after here.
set_cudagraph_capturing_enabled
(
False
)
# Lock workspace to prevent resizing during execution.
# Max workspace sizes should have been captured during warmup/profiling.
lock_workspace
()
end_time
=
time
.
perf_counter
()
elapsed_time
=
end_time
-
start_time
cuda_graph_size
=
start_free_gpu_memory
-
end_free_gpu_memory
...
...
@@ -4712,6 +4735,9 @@ class GPUModelRunner(
attention_backend_list
,
kv_cache_config
.
kv_cache_groups
)
# Check if attention backend supports PCP&DCP and related features.
check_attention_cp_compatibility
(
self
.
vllm_config
)
for
i
,
attn_backend_map
in
enumerate
(
attention_backend_maps
):
self
.
attn_groups
.
append
(
create_attn_groups
(
attn_backend_map
,
i
))
...
...
@@ -4871,7 +4897,7 @@ class GPUModelRunner(
# we need to adjust the cudagraph sizes to be a multiple of the uniform
# decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
# temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
# Will be removed in the near future when we have sep
e
rate cudagraph capture
# Will be removed in the near future when we have sep
a
rate cudagraph capture
# sizes for decode and mixed prefill-decode.
if
(
cudagraph_mode
.
decode_mode
()
==
CUDAGraphMode
.
FULL
...
...
@@ -5370,20 +5396,6 @@ class GPUModelRunner(
kv_transfer_group
.
register_kv_caches
(
kv_caches
)
kv_transfer_group
.
set_host_xfer_buffer_ops
(
copy_kv_blocks
)
if
self
.
dcp_world_size
>
1
:
layer_type
=
cast
(
type
[
Any
],
AttentionLayerBase
)
layers
=
get_layers_from_vllm_config
(
self
.
vllm_config
,
layer_type
)
for
layer
in
layers
.
values
():
layer_impl
=
getattr
(
layer
,
"impl"
,
None
)
if
layer_impl
is
None
:
continue
assert
layer_impl
.
need_to_return_lse_for_decode
,
(
"DCP requires attention impls to return"
" the softmax lse for decode, but the impl "
f
"
{
layer_impl
.
__class__
.
__name__
}
"
"does not return the softmax lse for decode."
)
def
may_add_encoder_only_layers_to_kv_cache_config
(
self
)
->
None
:
"""
Add encoder-only layers to the KV cache config.
...
...
vllm/v1/worker/gpu_worker.py
View file @
a3f8d5dd
...
...
@@ -54,6 +54,7 @@ from vllm.v1.outputs import (
from
vllm.v1.utils
import
report_usage_stats
from
vllm.v1.worker.utils
import
is_residual_scattered_for_sp
from
vllm.v1.worker.worker_base
import
WorkerBase
from
vllm.v1.worker.workspace
import
init_workspace_manager
logger
=
init_logger
(
__name__
)
...
...
@@ -81,7 +82,7 @@ class Worker(WorkerBase):
# configure float32 matmul precision according to vLLM env.
precision
=
envs
.
VLLM_FLOAT32_MATMUL_PRECISION
torch
.
set_float32_
matmul_precision
(
precision
)
torch
.
backends
.
cuda
.
matmul
.
fp32
_precision
=
precision
if
self
.
model_config
.
trust_remote_code
:
# note: lazy import to avoid importing torch before initializing
...
...
@@ -255,6 +256,10 @@ class Worker(WorkerBase):
else
:
raise
RuntimeError
(
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
# Initialize workspace manager
num_ubatches
=
2
if
self
.
vllm_config
.
parallel_config
.
enable_dbo
else
1
init_workspace_manager
(
self
.
device
,
num_ubatches
)
# Construct the model runner
if
self
.
use_v2_model_runner
:
from
vllm.v1.worker.gpu.model_runner
import
(
...
...
@@ -926,10 +931,11 @@ def init_worker_distributed_environment(
backend
:
str
=
"nccl"
,
)
->
None
:
"""Initialize the distributed environment."""
attention_config
=
vllm_config
.
attention_config
parallel_config
=
vllm_config
.
parallel_config
from
vllm.model_executor.layers.batch_invariant
import
init_batch_invariance
init_batch_invariance
()
init_batch_invariance
(
attention_config
.
backend
)
set_custom_all_reduce
(
not
parallel_config
.
disable_custom_all_reduce
)
init_method
=
distributed_init_method
or
"env://"
...
...
vllm/v1/worker/kv_connector_model_runner_mixin.py
View file @
a3f8d5dd
...
...
@@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import (
has_kv_transfer_group
,
)
from
vllm.distributed.kv_transfer.kv_connector.base
import
KVConnectorBase
from
vllm.distributed.kv_transfer.kv_connector.v1.metrics
import
KVConnectorStats
from
vllm.forward_context
import
get_forward_context
,
set_forward_context
from
vllm.logger
import
init_logger
from
vllm.v1.kv_cache_interface
import
AttentionSpec
,
KVCacheConfig
...
...
@@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin:
)
output
.
invalid_block_ids
=
kv_connector
.
get_block_ids_with_load_errors
()
output
.
kv_connector_stats
=
(
KVConnectorModelRunnerMixin
.
get_kv_connector_stats
()
)
kv_connector
.
clear_connector_metadata
()
output
.
kv_connector_stats
=
kv_connector
.
get_kv_connector_stats
()
output
.
kv_cache_events
=
kv_connector
.
get_kv_connector_kv_cache_events
()
@
staticmethod
def
get_kv_connector_stats
()
->
KVConnectorStats
|
None
:
if
has_kv_transfer_group
():
return
get_kv_transfer_group
().
get_kv_connector_stats
()
return
None
kv_connector
.
clear_connector_metadata
()
@
staticmethod
def
use_uniform_kv_cache
(
...
...
vllm/v1/worker/tpu_worker.py
View file @
a3f8d5dd
...
...
@@ -10,7 +10,7 @@ import torch
import
torch.nn
as
nn
import
vllm.envs
as
envs
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
,
...
...
@@ -207,7 +207,8 @@ class TPUWorker:
# one compiled bytecode. Having one FX graph/cached bytecode per
# compiled model is required for `support_torch_compile` decorator to
# skip dynamo guard.
self
.
model_runner
.
reset_dynamo_cache
()
with
set_current_vllm_config
(
self
.
vllm_config
):
self
.
model_runner
.
reset_dynamo_cache
()
# Get the maximum amount of memory used by the model weights and
# intermediate activations.
...
...
vllm/v1/worker/utils.py
View file @
a3f8d5dd
...
...
@@ -4,10 +4,12 @@ from collections import defaultdict
from
dataclasses
import
dataclass
,
field
import
torch
from
typing_extensions
import
deprecated
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.attention.layer
import
Attention
from
vllm.config
import
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.models.interfaces
import
MultiModalEmbeddings
from
vllm.model_executor.models.utils
import
extract_layer_index
from
vllm.multimodal.cache
import
processor_only_cache_from_config
...
...
@@ -17,6 +19,8 @@ from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from
vllm.v1.core.encoder_cache_manager
import
compute_mm_encoder_budget
from
vllm.v1.kv_cache_interface
import
KVCacheGroupSpec
,
KVCacheSpec
logger
=
init_logger
(
__name__
)
class
MultiModalBudget
:
"""Helper class to calculate budget information for multi-modal models."""
...
...
@@ -135,7 +139,7 @@ class AttentionGroup:
kv_cache_spec
:
KVCacheSpec
kv_cache_group_id
:
int
# When ubatching is enabled we will have a metadata builder for each ubatch
# so that if they use internal persist
a
nt buffers for cudagraphs, and they
# so that if they use internal persist
e
nt buffers for cudagraphs, and they
# won't have to worry about conflicting with the other ubatches.
metadata_builders
:
list
[
AttentionMetadataBuilder
]
=
field
(
default_factory
=
lambda
:
[]
...
...
@@ -198,6 +202,7 @@ def sanity_check_mm_encoder_outputs(
)
@
deprecated
(
"`scatter_mm_placeholders` is deprecated and will be removed in v0.15.0."
)
def
scatter_mm_placeholders
(
embeds
:
torch
.
Tensor
,
is_embed
:
torch
.
Tensor
|
None
,
...
...
@@ -226,6 +231,7 @@ def scatter_mm_placeholders(
return
placeholders
@
deprecated
(
"`gather_mm_placeholders` is deprecated and will be removed in v0.15.0."
)
def
gather_mm_placeholders
(
placeholders
:
torch
.
Tensor
,
is_embed
:
torch
.
Tensor
|
None
,
...
...
@@ -313,8 +319,12 @@ def bind_kv_cache(
# TODO - analyze where runner_kv_caches is used and the right
# way to ensure it properly reflects multiple attention layers
# in the same decoder block.
if
current_platform
.
is_cuda_alike
()
or
current_platform
.
is_xpu
():
# We know that the GPU runner is not impacted by this
if
(
current_platform
.
is_cuda_alike
()
or
current_platform
.
is_xpu
()
or
current_platform
.
is_cpu
()
):
# We know that the GPU / CPU runner is not impacted by this
# case. Some test code depends on runner_kv_caches, but
# not in a way that's impacted by ignoring this.
pass
...
...
vllm/v1/worker/workspace.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
inspect
import
os
from
itertools
import
accumulate
from
math
import
prod
from
typing
import
Optional
import
torch
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.utils.math_utils
import
round_up
from
vllm.v1.worker.ubatching
import
dbo_current_ubatch_id
logger
=
init_logger
(
__name__
)
def
_compute_bytes
(
shape
:
tuple
[
int
,
...],
dtype
:
torch
.
dtype
)
->
int
:
return
prod
(
shape
)
*
dtype
.
itemsize
# Constants: byte counts used to convert sizes for log and error messages.
_MB = 1024**2
_GiB = 1024**3

# Global workspace manager instance. It is None until init_workspace_manager()
# assigns it, and reset_workspace_manager() sets it back to None.
_manager: Optional["WorkspaceManager"] = None
class
WorkspaceManager
:
"""Manager for workspace allocation.
Manages workspace buffers for DBO (Dual Batch Overlap) execution.
Can be locked to prevent further growth during execution.
"""
def
__init__
(
self
,
device
:
torch
.
device
,
num_ubatches
:
int
|
None
=
None
):
self
.
_device
=
device
# Cache num ubatches at init based on configuration (default to 1)
self
.
_num_ubatches
=
num_ubatches
if
num_ubatches
is
not
None
else
1
self
.
_current_workspaces
:
list
[
torch
.
Tensor
|
None
]
=
[
None
,
None
]
self
.
_locked
:
bool
=
False
@
staticmethod
def
_workspace_size_bytes
(
workspace
:
torch
.
Tensor
|
None
)
->
int
:
"""Get size of workspace in bytes."""
if
workspace
is
None
:
return
0
return
workspace
.
numel
()
*
workspace
.
element_size
()
def
lock
(
self
)
->
None
:
"""Lock the workspace to prevent further growth.
After locking, any attempt to allocate a larger workspace will raise
an assertion error. This ensures workspace size is fixed during execution.
"""
self
.
_locked
=
True
if
envs
.
VLLM_DEBUG_WORKSPACE
:
logger
.
info
(
"[WORKSPACE DEBUG] Workspace locked. Current sizes: %s"
,
[
self
.
_workspace_size_bytes
(
ws
)
/
_MB
for
ws
in
self
.
_current_workspaces
if
ws
is
not
None
],
)
def
is_locked
(
self
)
->
bool
:
"""Check if workspace is locked."""
return
self
.
_locked
def
get_simultaneous
(
self
,
*
shapes_and_dtypes
:
tuple
[
tuple
[
int
,
...],
torch
.
dtype
]
)
->
list
[
torch
.
Tensor
]:
"""Get multiple workspace tensors simultaneously from a single allocation.
Args:
*shapes_and_dtypes: One or more (shape, dtype) tuples.
Returns:
List of tensor views into the workspace buffer, one per shape/dtype pair.
"""
actual_bytes
=
[
_compute_bytes
(
s
,
d
)
for
s
,
d
in
shapes_and_dtypes
]
aligned_bytes
=
[
round_up
(
actual
,
256
)
for
actual
in
actual_bytes
]
total_bytes
=
sum
(
aligned_bytes
)
# Calculate cumulative offsets using itertools.accumulate
offsets
=
list
(
accumulate
([
0
]
+
aligned_bytes
[:
-
1
]))
current_workspace
=
self
.
_ensure_workspace_size
(
total_bytes
)
return
[
current_workspace
[
offsets
[
i
]
:
offsets
[
i
]
+
actual_bytes
[
i
]]
.
view
(
shapes_and_dtypes
[
i
][
1
])
.
reshape
(
shapes_and_dtypes
[
i
][
0
])
for
i
in
range
(
len
(
shapes_and_dtypes
))
]
def
_ensure_workspace_size
(
self
,
required_bytes
:
int
)
->
torch
.
Tensor
:
"""Ensure workspace is allocated and large enough, return current workspace.
Args:
required_bytes: The number of bytes required.
Returns:
The current workspace tensor.
"""
ubatch_id
=
dbo_current_ubatch_id
()
current_workspace
=
self
.
_current_workspaces
[
ubatch_id
]
current_size
=
self
.
_workspace_size_bytes
(
current_workspace
)
if
current_size
<
required_bytes
:
def
get_caller_info
()
->
str
:
"""Find first frame outside WorkspaceManager."""
curr_frame
=
inspect
.
currentframe
()
if
curr_frame
is
None
:
return
"unknown"
# Walk up the stack skipping WorkspaceManager frames
curr_frame
=
curr_frame
.
f_back
while
curr_frame
is
not
None
:
# TODO: This only catches instance methods (self), missing
# classmethods and staticmethods. Once Python 3.11+ is the
# minimum supported version, use co_qualname instead:
# qualname = curr_frame.f_code.co_qualname
# if qualname.startswith("WorkspaceManager."):
if
isinstance
(
curr_frame
.
f_locals
.
get
(
"self"
),
WorkspaceManager
):
curr_frame
=
curr_frame
.
f_back
continue
filename
=
os
.
path
.
basename
(
curr_frame
.
f_code
.
co_filename
)
return
(
f
"
{
filename
}
:
{
curr_frame
.
f_lineno
}
:
{
curr_frame
.
f_code
.
co_name
}
"
)
return
"unknown"
if
self
.
_locked
:
raise
AssertionError
(
f
"Workspace is locked but allocation from '
{
get_caller_info
()
}
' "
f
"requires
{
required_bytes
/
_MB
:.
2
f
}
MB, current size is "
f
"
{
current_size
/
_MB
:.
2
f
}
MB. "
"Workspace growth is not allowed after locking."
)
for
ubatch_id
in
range
(
self
.
_num_ubatches
):
current_workspace
=
self
.
_current_workspaces
[
ubatch_id
]
if
current_workspace
is
None
:
self
.
_current_workspaces
[
ubatch_id
]
=
torch
.
empty
(
(
required_bytes
,),
dtype
=
torch
.
uint8
,
device
=
self
.
_device
)
elif
self
.
_workspace_size_bytes
(
current_workspace
)
<
required_bytes
:
current_workspace
.
resize_
(
required_bytes
)
if
envs
.
VLLM_DEBUG_WORKSPACE
:
logger
.
info
(
"[WORKSPACE DEBUG] Resized workspace from '%s': %.2f MB -> "
"%.2f MB (%d ubatches, total memory %.2f MB)"
,
get_caller_info
(),
current_size
/
_MB
,
required_bytes
/
_MB
,
self
.
_num_ubatches
,
required_bytes
*
self
.
_num_ubatches
/
_MB
,
)
current_workspace
=
self
.
_current_workspaces
[
dbo_current_ubatch_id
()]
return
current_workspace
def is_workspace_manager_initialized() -> bool:
    """Report whether the module-level workspace manager exists.

    Returns:
        True if workspace manager is initialized, False otherwise.
    """
    initialized = _manager is not None
    return initialized
def current_workspace_manager() -> "WorkspaceManager":
    """Get the current workspace manager instance.

    Returns:
        The module-level ``WorkspaceManager`` instance.

    Raises:
        AssertionError: If workspace manager has not been initialized.
    """
    # Raise explicitly rather than via an ``assert`` statement: asserts are
    # stripped when Python runs with -O, which would let None escape here.
    if _manager is None:
        raise AssertionError(
            "WorkspaceManager not initialized. Call init_workspace_manager() "
            "with a device before using workspace functions."
        )
    return _manager
def init_workspace_manager(
    device: torch.device, num_ubatches: int | None = None
) -> None:
    """Create the module-level workspace manager for ``device``.

    This has to run before any other workspace function is used. If a manager
    already exists, a warning is logged and it is replaced by a new one.

    Args:
        device: The device to allocate workspace on.
        num_ubatches: Number of micro-batches. Defaults to 1.
    """
    global _manager
    previous = _manager
    if previous is not None:
        logger.warning(
            "WorkspaceManager already initialized on device %s, "
            "reinitializing on device %s",
            previous._device,
            device,
        )
    _manager = WorkspaceManager(device, num_ubatches)
def lock_workspace() -> None:
    """Lock the current workspace manager so its buffers can no longer grow.

    Once locked, a request that needs more bytes than are currently allocated
    raises an AssertionError instead of allocating. This keeps the workspace
    size fixed during execution and avoids unexpected allocations in the hot
    path.

    Raises:
        AssertionError: If the workspace manager has not been initialized.

    Example:
        # During initialization
        init_workspace_manager(device)

        # ... warmup / profiling grows the workspace to its maximum size ...

        # Lock after warmup/profiling
        lock_workspace()

        # From here on, every request must fit in the pre-allocated size
    """
    manager = current_workspace_manager()
    manager.lock()
def reset_workspace_manager() -> None:
    """Reset the workspace manager to uninitialized state.

    This is primarily intended for testing purposes to allow tests
    to reinitialize the workspace manager cleanly.
    """
    global _manager
    # Only the module-level reference is dropped; nothing else is done to the
    # previous manager here.
    _manager = None
Prev
1
…
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment