Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3319a493
Unverified
Commit
3319a493
authored
Nov 19, 2025
by
Jialin Ouyang
Committed by
GitHub
Nov 19, 2025
Browse files
[Core] Reuse created spec tokens lists to mitigate GC cost (#28917)
Signed-off-by:
Jialin Ouyang
<
Jialin.Ouyang@gmail.com
>
parent
61728cd1
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
14 additions
and
7 deletions
+14
-7
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_input_batch.py
+12
-6
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-1
No files found.
vllm/v1/worker/gpu_input_batch.py
View file @
3319a493
...
@@ -251,7 +251,7 @@ class InputBatch:
...
@@ -251,7 +251,7 @@ class InputBatch:
self
.
logitsprocs_need_output_token_ids
=
logitsprocs_need_output_token_ids
self
.
logitsprocs_need_output_token_ids
=
logitsprocs_need_output_token_ids
# Store last speculative tokens for sampler.
# Store last speculative tokens for sampler.
self
.
spec_token_ids
:
list
[
list
[
int
]
|
None
]
=
[
]
self
.
spec_token_ids
:
list
[
list
[
int
]
]
=
[[]
for
_
in
range
(
max_num_reqs
)
]
# This is updated each time the batch constituents change.
# This is updated each time the batch constituents change.
self
.
sampling_metadata
=
self
.
_make_sampling_metadata
()
self
.
sampling_metadata
=
self
.
_make_sampling_metadata
()
...
@@ -313,7 +313,7 @@ class InputBatch:
...
@@ -313,7 +313,7 @@ class InputBatch:
else
:
else
:
self
.
_req_ids
[
req_index
]
=
req_id
self
.
_req_ids
[
req_index
]
=
req_id
self
.
req_output_token_ids
[
req_index
]
=
request
.
output_token_ids
self
.
req_output_token_ids
[
req_index
]
=
request
.
output_token_ids
self
.
spec_token_ids
[
req_index
]
=
[]
self
.
spec_token_ids
[
req_index
]
.
clear
()
self
.
req_id_to_index
[
req_id
]
=
req_index
self
.
req_id_to_index
[
req_id
]
=
req_index
...
@@ -462,7 +462,7 @@ class InputBatch:
...
@@ -462,7 +462,7 @@ class InputBatch:
self
.
batch_update_builder
.
removed_append
(
req_index
)
self
.
batch_update_builder
.
removed_append
(
req_index
)
self
.
_req_ids
[
req_index
]
=
None
self
.
_req_ids
[
req_index
]
=
None
self
.
req_output_token_ids
[
req_index
]
=
None
self
.
req_output_token_ids
[
req_index
]
=
None
self
.
spec_token_ids
[
req_index
]
=
None
self
.
spec_token_ids
[
req_index
]
.
clear
()
# LoRA
# LoRA
lora_id
=
self
.
request_lora_mapping
[
req_index
]
lora_id
=
self
.
request_lora_mapping
[
req_index
]
...
@@ -654,9 +654,15 @@ class InputBatch:
...
@@ -654,9 +654,15 @@ class InputBatch:
self
.
req_output_token_ids
[
last_req_index
]
=
None
self
.
req_output_token_ids
[
last_req_index
]
=
None
self
.
req_id_to_index
[
req_id
]
=
empty_index
self
.
req_id_to_index
[
req_id
]
=
empty_index
spec_token_ids
=
self
.
spec_token_ids
[
last_req_index
]
if
last_req_index
!=
empty_index
:
self
.
spec_token_ids
[
empty_index
]
=
spec_token_ids
(
self
.
spec_token_ids
[
last_req_index
]
=
None
self
.
spec_token_ids
[
last_req_index
],
self
.
spec_token_ids
[
empty_index
],
)
=
(
self
.
spec_token_ids
[
empty_index
],
self
.
spec_token_ids
[
last_req_index
],
)
self
.
spec_token_ids
[
last_req_index
].
clear
()
num_tokens
=
self
.
num_tokens
[
last_req_index
]
num_tokens
=
self
.
num_tokens
[
last_req_index
]
self
.
token_ids_cpu
[
empty_index
,
:
num_tokens
]
=
self
.
token_ids_cpu
[
self
.
token_ids_cpu
[
empty_index
,
:
num_tokens
]
=
self
.
token_ids_cpu
[
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
3319a493
...
@@ -892,7 +892,8 @@ class GPUModelRunner(
...
@@ -892,7 +892,8 @@ class GPUModelRunner(
# conform to the schema. This can result in
# conform to the schema. This can result in
# scheduler_output.scheduled_spec_decode_tokens being empty,
# scheduler_output.scheduled_spec_decode_tokens being empty,
# even when speculative decoding is enabled.
# even when speculative decoding is enabled.
self
.
input_batch
.
spec_token_ids
[
req_index
]
=
spec_token_ids
self
.
input_batch
.
spec_token_ids
[
req_index
].
clear
()
self
.
input_batch
.
spec_token_ids
[
req_index
].
extend
(
spec_token_ids
)
# there are no draft tokens with async scheduling,
# there are no draft tokens with async scheduling,
# we clear the spec_decoding info in scheduler_output and
# we clear the spec_decoding info in scheduler_output and
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment