Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b3546865
Unverified
Commit b3546865 authored Mar 06, 2026 by Nick Hill
Committed by GitHub, Mar 06, 2026
Browse files
[Model Runner V2] Fix warmup for pipeline parallel (#36280)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
parent
6a18d878
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing 2 changed files with 16 additions and 8 deletions
+16
-8
vllm/v1/worker/gpu/warmup.py
vllm/v1/worker/gpu/warmup.py
+15
-7
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+1
-1
No files found.
vllm/v1/worker/gpu/warmup.py
View file @
b3546865
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Callable
from
typing
import
Any
import
numpy
as
np
import
torch
...
...
@@ -17,9 +20,14 @@ from vllm.v1.worker.gpu.model_runner import GPUModelRunner
@
torch
.
inference_mode
()
def
warmup_kernels
(
model_runner
:
GPUModelRunner
)
->
None
:
def
warmup_kernels
(
model_runner
:
GPUModelRunner
,
worker_execute_model
:
Callable
[[
SchedulerOutput
],
Any
],
worker_sample_tokens
:
Callable
[[
GrammarOutput
|
None
],
Any
],
)
->
None
:
"""Run two execute_model + sample_tokens iterations to JIT compile
triton kernels.
triton kernels. We must call the provided worker's execute_model for
pipeline parallel coordination.
The first iteration simulates a prefill with requests of 2 prompt
tokens each. The second iteration simulates a decode step with all
...
...
@@ -83,7 +91,7 @@ def warmup_kernels(model_runner: GPUModelRunner) -> None:
# Disable KV connector for warmup run.
model_runner
.
kv_connector
.
set_disabled
(
True
)
model_runn
er
.
execute_model
(
prefill_output
)
work
er
_
execute_model
(
prefill_output
)
if
not
model_runner
.
is_pooling_model
:
# Warm up sampler and perform a decode step for non-pooling models.
...
...
@@ -101,7 +109,7 @@ def warmup_kernels(model_runner: GPUModelRunner) -> None:
structured_output_request_ids
=
req_ids
,
grammar_bitmask
=
grammar_bitmask
)
model_runn
er
.
sample_tokens
(
grammar_output
)
work
er
_
sample_tokens
(
grammar_output
)
# Step 2: Decode all requests with 1 token each.
cached_req_data
=
CachedRequestData
.
make_empty
()
...
...
@@ -120,12 +128,12 @@ def warmup_kernels(model_runner: GPUModelRunner) -> None:
decode_output
.
total_num_scheduled_tokens
=
num_reqs
decode_output
.
num_common_prefix_blocks
=
[
0
]
*
num_kv_cache_groups
model_runn
er
.
execute_model
(
decode_output
)
model_runn
er
.
sample_tokens
(
None
)
work
er
_
execute_model
(
decode_output
)
work
er
_
sample_tokens
(
None
)
# Clean up - process finish_req_ids.
cleanup_output
=
SchedulerOutput
.
make_empty
()
cleanup_output
.
finished_req_ids
=
set
(
req_ids
)
model_runn
er
.
execute_model
(
cleanup_output
)
work
er
_
execute_model
(
cleanup_output
)
model_runner
.
kv_connector
.
set_disabled
(
False
)
torch
.
accelerator
.
synchronize
()
vllm/v1/worker/gpu_worker.py
View file @
b3546865
...
...
@@ -584,7 +584,7 @@ class Worker(WorkerBase):
if
self
.
use_v2_model_runner
:
# V2: Run full execute_model + sample_tokens to JIT compile triton kernels.
warmup_kernels
(
self
.
model_runner
)
warmup_kernels
(
self
.
model_runner
,
self
.
execute_model
,
self
.
sample_tokens
)
elif
get_pp_group
().
is_last_rank
:
# V1: Warm up sampler and preallocate memory buffer for logits and other
# sampling related tensors of max possible shape to avoid memory
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment.