Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
28e07508
Unverified
Commit
28e07508
authored
Jan 26, 2025
by
Woosuk Kwon
Committed by
GitHub
Jan 26, 2025
Browse files
[V1] Avoid list creation in input preparation (#12457)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
582cf787
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
5 deletions
+12
-5
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+12
-5
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
28e07508
...
...
@@ -171,7 +171,8 @@ class GPUModelRunner:
# OPTIMIZATION: Cache the tensors rather than creating them every step.
self
.
arange_np
=
np
.
arange
(
max
(
self
.
max_num_reqs
+
1
,
self
.
max_model_len
),
self
.
max_model_len
,
self
.
max_num_tokens
),
dtype
=
np
.
int32
)
# NOTE(woosuk): These tensors are "stateless", i.e., they are literally
# a faster version of creating a new tensor every time. Thus, we should
...
...
@@ -358,8 +359,15 @@ class GPUModelRunner:
# Get batched arange.
# E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
arange
=
np
.
concatenate
(
[
self
.
arange_np
[:
n
]
for
n
in
num_scheduled_tokens
])
# Equivalent to but faster than:
# np.concatenate([np.arange(n) for n in num_scheduled_tokens])
# Step 1. [2, 5, 3] -> [2, 7, 10]
cu_num_tokens
=
np
.
cumsum
(
num_scheduled_tokens
)
# Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
cumsums_offsets
=
np
.
repeat
(
cu_num_tokens
-
num_scheduled_tokens
,
num_scheduled_tokens
)
# Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
arange
=
self
.
arange_np
[:
total_num_scheduled_tokens
]
-
cumsums_offsets
# Get positions.
positions_np
=
self
.
positions_np
[:
total_num_scheduled_tokens
]
...
...
@@ -406,8 +414,7 @@ class GPUModelRunner:
# Prepare the attention metadata.
self
.
query_start_loc_np
[
0
]
=
0
np
.
cumsum
(
num_scheduled_tokens
,
out
=
self
.
query_start_loc_np
[
1
:
num_reqs
+
1
])
self
.
query_start_loc_np
[
1
:
num_reqs
+
1
]
=
cu_num_tokens
self
.
seq_lens_np
[:
num_reqs
]
=
(
self
.
input_batch
.
num_computed_tokens_cpu
[:
num_reqs
]
+
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment