Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a1d3866d
Unverified
Commit
a1d3866d
authored
Nov 12, 2025
by
Jialin Ouyang
Committed by
GitHub
Nov 13, 2025
Browse files
[n-gen] DO NOT repeatedly return finished child requests (#28591)
Signed-off-by:
Jialin Ouyang
<
Jialin.Ouyang@gmail.com
>
parent
97d1c993
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
115 additions
and
3 deletions
+115
-3
tests/v1/engine/test_parallel_sampling.py
tests/v1/engine/test_parallel_sampling.py
+103
-0
vllm/v1/engine/parallel_sampling.py
vllm/v1/engine/parallel_sampling.py
+12
-3
No files found.
tests/v1/engine/test_parallel_sampling.py
0 → 100644
View file @
a1d3866d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
SamplingParams
from
vllm.outputs
import
CompletionOutput
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.v1.engine.parallel_sampling
import
ParentRequest
def test_parent_request_to_output_stream() -> None:
    """Check streaming output of a parent request with two children.

    While a child is unfinished, every call echoes the output it was given.
    A finished child's output is returned exactly once; later calls for that
    child yield an empty list. The parent is flagged finished only after both
    children have finished.
    """
    parent = ParentRequest("parent_id", SamplingParams(n=2))
    parent.child_requests = {"child_id_0", "child_id_1"}

    output_0 = CompletionOutput(
        index=0,
        text="child 0",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
    )
    output_1 = CompletionOutput(
        index=1,
        text="child 1",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
    )

    def emit(child_id, output):
        # Thin wrapper so each assertion reads "call == expected".
        return parent.get_outputs(child_id, output)

    # Neither child has finished: two identical rounds, each echoing the
    # output that was passed in.
    for _ in range(2):
        assert emit("child_id_0", output_0) == ("parent_id", [output_0], False)
        assert emit("child_id_1", output_1) == ("parent_id", [output_1], False)

    # Child 1 finishes; its final output is delivered once.
    output_1.finish_reason = "ended"
    assert emit("child_id_0", output_0) == ("parent_id", [output_0], False)
    assert emit("child_id_1", output_1) == ("parent_id", [output_1], False)

    # Child 1's finished output was already returned, so it must not be
    # returned again; child 0 keeps streaming.
    assert emit("child_id_0", output_0) == ("parent_id", [output_0], False)
    assert emit("child_id_1", output_1) == ("parent_id", [], False)

    # Child 0 finishes; the parent is now reported as finished.
    output_0.finish_reason = "ended"
    assert emit("child_id_0", output_0) == ("parent_id", [output_0], True)
    assert emit("child_id_1", output_1) == ("parent_id", [], True)

    # Child 0's finished output was already returned, so nothing is returned
    # again for either child.
    assert emit("child_id_0", output_0) == ("parent_id", [], True)
    assert emit("child_id_1", output_1) == ("parent_id", [], True)
def test_parent_request_to_output_final_only() -> None:
    """Check FINAL_ONLY output of a parent request with two children.

    No outputs are returned while any child is still unfinished. Once both
    children have finished, both outputs are returned together and the parent
    is flagged finished.
    """
    params = SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
    parent = ParentRequest("parent_id", params)
    parent.child_requests = {"child_id_0", "child_id_1"}

    output_0 = CompletionOutput(
        index=0,
        text="child 0",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
    )
    output_1 = CompletionOutput(
        index=1,
        text="child 1",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
    )

    def emit(child_id, output):
        # Thin wrapper so each assertion reads "call == expected".
        return parent.get_outputs(child_id, output)

    # Expected result whenever at least one child is still unfinished.
    nothing_yet = ("parent_id", [], False)

    # Neither child has finished: nothing is returned.
    assert emit("child_id_0", output_0) == nothing_yet
    assert emit("child_id_1", output_1) == nothing_yet

    # Child 1 finishes, but outputs are withheld until every child is done.
    output_1.finish_reason = "ended"
    assert emit("child_id_0", output_0) == nothing_yet
    assert emit("child_id_1", output_1) == nothing_yet

    # Child 0 finishes; with all children done, both outputs are returned.
    output_0.finish_reason = "ended"
    assert emit("child_id_0", output_0) == ("parent_id", [output_0, output_1], True)
    assert emit("child_id_1", output_1) == ("parent_id", [output_0, output_1], True)
vllm/v1/engine/parallel_sampling.py
View file @
a1d3866d
...
...
@@ -97,12 +97,21 @@ class ParentRequest:
child_request_id
:
str
,
completion_output
:
CompletionOutput
,
)
->
tuple
[
str
,
list
[
CompletionOutput
],
bool
]:
already_finished_and_returned
:
bool
=
False
if
completion_output
.
finished
():
if
child_request_id
in
self
.
child_requests
:
self
.
child_requests
.
remove
(
child_request_id
)
else
:
# child request ID is not available in child_requests
# which means the request had finished in previous
# batch step and returned to the client earlier
already_finished_and_returned
=
True
if
self
.
sampling_params
.
output_kind
!=
RequestOutputKind
.
FINAL_ONLY
:
# If streaming, just return the current output.
outputs
=
[
completion_output
]
# If streaming, just return the current output
#
# DO NOT output finished and already returned child request to client again
outputs
=
[]
if
already_finished_and_returned
else
[
completion_output
]
else
:
# If not streaming, aggregate the n final outputs.
self
.
output_aggregator
[
completion_output
.
index
]
=
completion_output
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment