Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4e824d1c
Unverified
Commit
4e824d1c
authored
Mar 24, 2026
by
Nick Hill
Committed by
GitHub
Mar 24, 2026
Browse files
[Model Runner V2][Minor] Simplify PP logic (#38031)
Signed-off-by:
Nick Hill
<
nickhill123@gmail.com
>
parent
0c1809c8
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
23 additions
and
28 deletions
+23
-28
vllm/sequence.py
vllm/sequence.py
+9
-0
vllm/v1/worker/gpu/cudagraph_utils.py
vllm/v1/worker/gpu/cudagraph_utils.py
+5
-13
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+9
-15
No files found.
vllm/sequence.py
View file @
4e824d1c
...
@@ -62,3 +62,12 @@ class IntermediateTensors:
...
@@ -62,3 +62,12 @@ class IntermediateTensors:
def
__repr__
(
self
)
->
str
:
def
__repr__
(
self
)
->
str
:
return
f
"IntermediateTensors(tensors=
{
self
.
tensors
}
)"
return
f
"IntermediateTensors(tensors=
{
self
.
tensors
}
)"
@
staticmethod
def
empty_like
(
intermediate_tensors
:
"IntermediateTensors"
,
)
->
"IntermediateTensors"
:
tensors
=
{
k
:
torch
.
empty_like
(
v
)
for
k
,
v
in
intermediate_tensors
.
tensors
.
items
()
}
return
IntermediateTensors
(
tensors
)
vllm/v1/worker/gpu/cudagraph_utils.py
View file @
4e824d1c
...
@@ -94,13 +94,8 @@ class CudaGraphManager:
...
@@ -94,13 +94,8 @@ class CudaGraphManager:
self
.
decode_query_len
=
decode_query_len
self
.
decode_query_len
=
decode_query_len
self
.
dp_size
=
vllm_config
.
parallel_config
.
data_parallel_size
self
.
dp_size
=
vllm_config
.
parallel_config
.
data_parallel_size
self
.
pp_size
=
vllm_config
.
parallel_config
.
pipeline_parallel_size
if
self
.
pp_size
>
1
:
self
.
is_first_pp_rank
=
get_pp_group
().
is_first_rank
self
.
is_first_pp_rank
=
get_pp_group
().
is_first_rank
self
.
is_last_pp_rank
=
get_pp_group
().
is_last_rank
self
.
is_last_pp_rank
=
get_pp_group
().
is_last_rank
else
:
self
.
is_first_pp_rank
=
True
self
.
is_last_pp_rank
=
True
self
.
graphs
:
dict
[
BatchExecutionDescriptor
,
torch
.
cuda
.
CUDAGraph
]
=
{}
self
.
graphs
:
dict
[
BatchExecutionDescriptor
,
torch
.
cuda
.
CUDAGraph
]
=
{}
self
.
pool
=
current_platform
.
get_global_graph_pool
()
if
cudagraph_mode
else
None
self
.
pool
=
current_platform
.
get_global_graph_pool
()
if
cudagraph_mode
else
None
...
@@ -371,14 +366,11 @@ class ModelCudaGraphManager(CudaGraphManager):
...
@@ -371,14 +366,11 @@ class ModelCudaGraphManager(CudaGraphManager):
self
.
aux_hidden_states
[
i
][:
num_tokens
]
=
aux
self
.
aux_hidden_states
[
i
][:
num_tokens
]
=
aux
else
:
else
:
# Non-last PP rank.
# Non-last PP rank.
assert
isinstance
(
model_output
,
IntermediateTensors
)
intermediate_tensors
=
model_output
intermediate_tensors
=
model_output
assert
isinstance
(
intermediate_tensors
,
IntermediateTensors
)
if
self
.
intermediate_tensors
is
None
:
if
self
.
intermediate_tensors
is
None
:
self
.
intermediate_tensors
=
IntermediateTensors
(
self
.
intermediate_tensors
=
IntermediateTensors
.
empty_like
(
{
intermediate_tensors
k
:
torch
.
empty_like
(
v
)
for
k
,
v
in
intermediate_tensors
.
tensors
.
items
()
}
)
)
for
k
,
v
in
intermediate_tensors
.
tensors
.
items
():
for
k
,
v
in
intermediate_tensors
.
tensors
.
items
():
self
.
intermediate_tensors
[
k
][:
num_tokens
]
=
v
self
.
intermediate_tensors
[
k
][:
num_tokens
]
=
v
...
...
vllm/v1/worker/gpu/model_runner.py
View file @
4e824d1c
...
@@ -132,14 +132,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -132,14 +132,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self
.
output_copy_event
=
torch
.
cuda
.
Event
()
self
.
output_copy_event
=
torch
.
cuda
.
Event
()
# Pipeline parallelism.
# Pipeline parallelism.
self
.
pp_size
=
self
.
parallel_config
.
pipeline_parallel_size
self
.
use_pp
=
self
.
parallel_config
.
pipeline_parallel_size
>
1
self
.
use_pp
=
self
.
pp_size
>
1
if
self
.
use_pp
:
self
.
is_first_pp_rank
=
get_pp_group
().
is_first_rank
self
.
is_first_pp_rank
=
get_pp_group
().
is_first_rank
self
.
is_last_pp_rank
=
get_pp_group
().
is_last_rank
self
.
is_last_pp_rank
=
get_pp_group
().
is_last_rank
else
:
self
.
is_first_pp_rank
=
True
self
.
is_last_pp_rank
=
True
# Persistent buffer for intermediate tensors (non-first PP ranks).
# Persistent buffer for intermediate tensors (non-first PP ranks).
self
.
intermediate_tensors
:
IntermediateTensors
|
None
=
None
self
.
intermediate_tensors
:
IntermediateTensors
|
None
=
None
...
@@ -179,7 +175,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -179,7 +175,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if
self
.
speculative_config
.
method
==
"eagle3"
:
if
self
.
speculative_config
.
method
==
"eagle3"
:
# EAGLE3 may require auxiliary hidden states from target model outputs.
# EAGLE3 may require auxiliary hidden states from target model outputs.
self
.
use_aux_hidden_state_outputs
=
True
self
.
use_aux_hidden_state_outputs
=
True
if
self
.
pp_size
>
1
:
if
self
.
use_pp
:
raise
ValueError
(
"EAGLE3 with pipeline parallel is not supported."
)
raise
ValueError
(
"EAGLE3 with pipeline parallel is not supported."
)
# Draft tokens propagation - for spec-dec + struct outputs.
# Draft tokens propagation - for spec-dec + struct outputs.
...
@@ -270,8 +266,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -270,8 +266,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
logger
.
info
(
"Loading model from scratch..."
)
logger
.
info
(
"Loading model from scratch..."
)
self
.
model
=
model_loader
.
load_model
(
self
.
model
=
model_loader
.
load_model
(
vllm_config
=
self
.
vllm_config
,
vllm_config
=
self
.
vllm_config
,
model_config
=
self
.
vllm_config
.
model_config
model_config
=
self
.
vllm_config
.
model_config
,
)
)
if
self
.
lora_config
:
if
self
.
lora_config
:
self
.
model
=
self
.
load_lora_model
(
self
.
model
=
self
.
load_lora_model
(
...
@@ -1026,14 +1021,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1026,14 +1021,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
assert
intermediate_tensors
is
not
None
assert
intermediate_tensors
is
not
None
assert
self
.
intermediate_tensors
is
not
None
assert
self
.
intermediate_tensors
is
not
None
n
=
input_batch
.
num_tokens_after_padding
n
=
input_batch
.
num_tokens_after_padding
intermediate_tensors
=
IntermediateTensors
(
model_inputs
[
"intermediate_tensors"
]
=
IntermediateTensors
(
{
{
k
:
v
[:
n
].
copy_
(
intermediate_tensors
.
tensors
[
k
][:
n
])
k
:
v
[:
n
].
copy_
(
intermediate_tensors
.
tensors
[
k
][:
n
])
for
k
,
v
in
self
.
intermediate_tensors
.
tensors
.
items
()
for
k
,
v
in
self
.
intermediate_tensors
.
tensors
.
items
()
},
}
intermediate_tensors
.
kv_connector_output
,
)
)
model_inputs
[
"intermediate_tensors"
]
=
intermediate_tensors
del
intermediate_tensors
# Run model.
# Run model.
if
batch_desc
.
cg_mode
==
CUDAGraphMode
.
FULL
:
if
batch_desc
.
cg_mode
==
CUDAGraphMode
.
FULL
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment