Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4a80ad0a
Unverified
Commit
4a80ad0a
authored
Nov 28, 2025
by
Woosuk Kwon
Committed by
GitHub
Nov 28, 2025
Browse files
[Model Runner V2] Don't use UVA buffer for prefill_len (#29713)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
4b17ce68
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
1 deletion
+6
-1
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+2
-0
vllm/v1/worker/gpu/states.py
vllm/v1/worker/gpu/states.py
+4
-1
No files found.
vllm/v1/worker/gpu/model_runner.py
View file @
4a80ad0a
...
@@ -410,6 +410,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -410,6 +410,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
cu_num_new_blocks
[
i
].
append
(
x
+
len
(
block_ids
))
cu_num_new_blocks
[
i
].
append
(
x
+
len
(
block_ids
))
new_block_ids
[
i
].
extend
(
block_ids
)
new_block_ids
[
i
].
extend
(
block_ids
)
overwrite
.
append
(
True
)
overwrite
.
append
(
True
)
if
scheduler_output
.
scheduled_new_reqs
:
self
.
req_states
.
prefill_len
.
copy_to_gpu
()
# Add new blocks for the existing requests.
# Add new blocks for the existing requests.
cached_reqs
=
scheduler_output
.
scheduled_cached_reqs
cached_reqs
=
scheduler_output
.
scheduled_cached_reqs
...
...
vllm/v1/worker/gpu/states.py
View file @
4a80ad0a
...
@@ -117,7 +117,10 @@ class RequestState:
...
@@ -117,7 +117,10 @@ class RequestState:
self
.
prefill_token_ids
=
UvaBuffer
(
self
.
prefill_token_ids
=
UvaBuffer
(
self
.
max_num_reqs
,
self
.
max_model_len
,
dtype
=
torch
.
int32
self
.
max_num_reqs
,
self
.
max_model_len
,
dtype
=
torch
.
int32
)
)
self
.
prefill_len
=
UvaBuffer
(
self
.
max_num_reqs
,
dtype
=
torch
.
int32
)
# NOTE(woosuk): We don't use UVA for prefill_len because its GPU view
# can be used outside of update_states and prepare_inputs.
# Without async barrier, using UVA can cause race conditions.
self
.
prefill_len
=
self
.
_make_buffer
(
self
.
max_num_reqs
,
dtype
=
torch
.
int32
)
# Number of computed tokens.
# Number of computed tokens.
self
.
num_computed_prefill_tokens
=
np
.
zeros
(
self
.
max_num_reqs
,
dtype
=
np
.
int32
)
self
.
num_computed_prefill_tokens
=
np
.
zeros
(
self
.
max_num_reqs
,
dtype
=
np
.
int32
)
self
.
num_computed_tokens
=
torch
.
zeros
(
self
.
num_computed_tokens
=
torch
.
zeros
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment