Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ee80aee1
Unverified
Commit
ee80aee1
authored Nov 26, 2025 by Woosuk Kwon
Committed by GitHub Nov 26, 2025
Browse files
[Model Runner V2] Minor cleanup for build_attn_metadata (#29576)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent
0aeb698b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing 3 changed files with 13 additions and 8 deletions
+13
-8
vllm/v1/worker/gpu/attn_utils.py
vllm/v1/worker/gpu/attn_utils.py
+3
-5
vllm/v1/worker/gpu/cudagraph_utils.py
vllm/v1/worker/gpu/cudagraph_utils.py
+2
-1
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+8
-2
No files found.
vllm/v1/worker/gpu/attn_utils.py
View file @
ee80aee1
...
@@ -18,7 +18,6 @@ from vllm.v1.kv_cache_interface import (
...
@@ -18,7 +18,6 @@ from vllm.v1.kv_cache_interface import (
KVCacheConfig
,
KVCacheConfig
,
KVCacheSpec
,
KVCacheSpec
,
)
)
from
vllm.v1.utils
import
CpuGpuBuffer
from
vllm.v1.worker.utils
import
bind_kv_cache
from
vllm.v1.worker.utils
import
bind_kv_cache
...
@@ -145,7 +144,8 @@ def build_attn_metadata(
...
@@ -145,7 +144,8 @@ def build_attn_metadata(
attn_metadata_builders
:
list
[
AttentionMetadataBuilder
],
attn_metadata_builders
:
list
[
AttentionMetadataBuilder
],
num_reqs
:
int
,
num_reqs
:
int
,
num_tokens
:
int
,
num_tokens
:
int
,
query_start_loc
:
CpuGpuBuffer
,
query_start_loc_gpu
:
torch
.
Tensor
,
query_start_loc_cpu
:
torch
.
Tensor
,
seq_lens
:
torch
.
Tensor
,
seq_lens
:
torch
.
Tensor
,
seq_lens_np
:
np
.
ndarray
,
seq_lens_np
:
np
.
ndarray
,
num_computed_tokens_cpu
:
torch
.
Tensor
|
None
,
num_computed_tokens_cpu
:
torch
.
Tensor
|
None
,
...
@@ -153,9 +153,7 @@ def build_attn_metadata(
...
@@ -153,9 +153,7 @@ def build_attn_metadata(
slot_mappings
:
torch
.
Tensor
,
slot_mappings
:
torch
.
Tensor
,
kv_cache_config
:
KVCacheConfig
,
kv_cache_config
:
KVCacheConfig
,
)
->
dict
[
str
,
Any
]:
)
->
dict
[
str
,
Any
]:
query_start_loc_gpu
=
query_start_loc
.
gpu
[:
num_reqs
+
1
]
max_query_len
=
int
(
query_start_loc_cpu
.
max
())
query_start_loc_cpu
=
query_start_loc
.
cpu
[:
num_reqs
+
1
]
max_query_len
=
int
(
query_start_loc
.
np
[:
num_reqs
+
1
].
max
())
seq_lens
=
seq_lens
[:
num_reqs
]
seq_lens
=
seq_lens
[:
num_reqs
]
seq_lens_cpu
=
torch
.
from_numpy
(
seq_lens_np
)
seq_lens_cpu
=
torch
.
from_numpy
(
seq_lens_np
)
max_seq_len
=
int
(
seq_lens_np
.
max
())
max_seq_len
=
int
(
seq_lens_np
.
max
())
...
...
vllm/v1/worker/gpu/cudagraph_utils.py
View file @
ee80aee1
...
@@ -120,7 +120,8 @@ class CudaGraphManager:
...
@@ -120,7 +120,8 @@ class CudaGraphManager:
attn_metadata_builders
=
attn_metadata_builders
,
attn_metadata_builders
=
attn_metadata_builders
,
num_reqs
=
batch_size
,
num_reqs
=
batch_size
,
num_tokens
=
batch_size
,
num_tokens
=
batch_size
,
query_start_loc
=
input_buffers
.
query_start_loc
,
query_start_loc_gpu
=
input_buffers
.
query_start_loc
.
gpu
[:
batch_size
+
1
],
query_start_loc_cpu
=
input_buffers
.
query_start_loc
.
cpu
[:
batch_size
+
1
],
seq_lens
=
input_buffers
.
seq_lens
,
seq_lens
=
input_buffers
.
seq_lens
,
seq_lens_np
=
np
.
full
(
batch_size
,
self
.
max_model_len
,
dtype
=
np
.
int32
),
seq_lens_np
=
np
.
full
(
batch_size
,
self
.
max_model_len
,
dtype
=
np
.
int32
),
num_computed_tokens_cpu
=
None
,
# FIXME
num_computed_tokens_cpu
=
None
,
# FIXME
...
...
vllm/v1/worker/gpu/model_runner.py
View file @
ee80aee1
...
@@ -226,11 +226,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -226,11 +226,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
num_computed_tokens
=
torch
.
zeros
(
num_computed_tokens
=
torch
.
zeros
(
input_batch
.
num_reqs
,
dtype
=
torch
.
int32
,
device
=
self
.
device
input_batch
.
num_reqs
,
dtype
=
torch
.
int32
,
device
=
self
.
device
)
)
query_start_loc
=
self
.
input_buffers
.
query_start_loc
query_start_loc_gpu
=
query_start_loc
.
gpu
[:
input_batch
.
num_reqs
+
1
]
query_start_loc_cpu
=
query_start_loc
.
cpu
[:
input_batch
.
num_reqs
+
1
]
attn_metadata
=
build_attn_metadata
(
attn_metadata
=
build_attn_metadata
(
attn_metadata_builders
=
self
.
attn_metadata_builders
,
attn_metadata_builders
=
self
.
attn_metadata_builders
,
num_reqs
=
input_batch
.
num_reqs
,
num_reqs
=
input_batch
.
num_reqs
,
num_tokens
=
input_batch
.
num_tokens
,
num_tokens
=
input_batch
.
num_tokens
,
query_start_loc
=
self
.
input_buffers
.
query_start_loc
,
query_start_loc_gpu
=
query_start_loc_gpu
,
query_start_loc_cpu
=
query_start_loc_cpu
,
seq_lens
=
self
.
input_buffers
.
seq_lens
,
seq_lens
=
self
.
input_buffers
.
seq_lens
,
seq_lens_np
=
input_batch
.
seq_lens_np
,
seq_lens_np
=
input_batch
.
seq_lens_np
,
num_computed_tokens_cpu
=
num_computed_tokens
,
num_computed_tokens_cpu
=
num_computed_tokens
,
...
@@ -515,6 +519,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -515,6 +519,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self
.
input_buffers
.
query_start_loc
.
np
[
num_reqs
+
1
:]
=
num_tokens
self
.
input_buffers
.
query_start_loc
.
np
[
num_reqs
+
1
:]
=
num_tokens
self
.
input_buffers
.
query_start_loc
.
copy_to_gpu
()
self
.
input_buffers
.
query_start_loc
.
copy_to_gpu
()
query_start_loc_gpu
=
self
.
input_buffers
.
query_start_loc
.
gpu
[:
num_reqs
+
1
]
query_start_loc_gpu
=
self
.
input_buffers
.
query_start_loc
.
gpu
[:
num_reqs
+
1
]
query_start_loc_cpu
=
self
.
input_buffers
.
query_start_loc
.
cpu
[:
num_reqs
+
1
]
query_start_loc_np
=
self
.
input_buffers
.
query_start_loc
.
np
[:
num_reqs
+
1
]
query_start_loc_np
=
self
.
input_buffers
.
query_start_loc
.
np
[:
num_reqs
+
1
]
# Copy prefill tokens from CPU to GPU.
# Copy prefill tokens from CPU to GPU.
...
@@ -572,7 +577,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -572,7 +577,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
attn_metadata_builders
=
self
.
attn_metadata_builders
,
attn_metadata_builders
=
self
.
attn_metadata_builders
,
num_reqs
=
num_reqs
,
num_reqs
=
num_reqs
,
num_tokens
=
num_tokens
,
num_tokens
=
num_tokens
,
query_start_loc
=
self
.
input_buffers
.
query_start_loc
,
query_start_loc_gpu
=
query_start_loc_gpu
,
query_start_loc_cpu
=
query_start_loc_cpu
,
seq_lens
=
self
.
input_buffers
.
seq_lens
,
seq_lens
=
self
.
input_buffers
.
seq_lens
,
seq_lens_np
=
seq_lens_np
,
seq_lens_np
=
seq_lens_np
,
num_computed_tokens_cpu
=
num_computed_tokens
,
num_computed_tokens_cpu
=
num_computed_tokens
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment