Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
30b9c677
Unverified
Commit
30b9c677
authored
Nov 20, 2025
by
Jialin Ouyang
Committed by
GitHub
Nov 20, 2025
Browse files
Revert "[Redo] #26368 (#28771)" (#29121)
Signed-off-by:
Jialin Ouyang
<
Jialin.Ouyang@gmail.com
>
parent
11857a00
Changes
16
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
99 additions
and
127 deletions
+99
-127
tests/v1/core/test_async_scheduler.py
tests/v1/core/test_async_scheduler.py
+1
-2
tests/v1/core/test_priority_scheduler_random.py
tests/v1/core/test_priority_scheduler_random.py
+2
-4
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+38
-50
tests/v1/kv_connector/unit/test_nixl_connector.py
tests/v1/kv_connector/unit/test_nixl_connector.py
+3
-4
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/utils.py
+1
-2
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+1
-4
tests/v1/spec_decode/test_ngram.py
tests/v1/spec_decode/test_ngram.py
+9
-9
vllm/utils/gc_utils.py
vllm/utils/gc_utils.py
+8
-5
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+2
-2
vllm/v1/outputs.py
vllm/v1/outputs.py
+2
-2
vllm/v1/sample/rejection_sampler.py
vllm/v1/sample/rejection_sampler.py
+5
-3
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+4
-3
vllm/v1/spec_decode/ngram_proposer.py
vllm/v1/spec_decode/ngram_proposer.py
+3
-3
vllm/v1/spec_decode/suffix_decoding.py
vllm/v1/spec_decode/suffix_decoding.py
+4
-6
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+13
-23
vllm/v1/worker/tpu_model_runner.py
vllm/v1/worker/tpu_model_runner.py
+3
-5
No files found.
tests/v1/core/test_async_scheduler.py
View file @
30b9c677
...
...
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections
import
deque
import
numpy
as
np
import
pytest
from
vllm.v1.core.sched.output
import
SchedulerOutput
...
...
@@ -22,7 +21,7 @@ def _make_model_runner_output(
return
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
{
req_id
:
i
for
i
,
req_id
in
enumerate
(
req_ids
)},
sampled_token_ids
=
[
np
.
array
(
[
i
]
)
for
i
in
range
(
len
(
req_ids
))],
sampled_token_ids
=
[[
i
]
for
i
in
range
(
len
(
req_ids
))],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
tests/v1/core/test_priority_scheduler_random.py
View file @
30b9c677
...
...
@@ -3,7 +3,6 @@
import
random
import
uuid
import
numpy
as
np
import
pytest
from
vllm.config
import
VllmConfig
...
...
@@ -100,7 +99,8 @@ def _mock_execute_model(
random
.
randint
(
*
num_output_tokens_range
)
for
_
in
range
(
len
(
request_ids
))
]
sampled_token_ids
=
[
np
.
random
.
randint
(
0
,
100
,
size
=
num_tokens
)
for
num_tokens
in
num_output_tokens
[
random
.
randint
(
0
,
100
)
for
_
in
range
(
num_tokens
)]
for
num_tokens
in
num_output_tokens
]
return
ModelRunnerOutput
(
...
...
@@ -196,8 +196,6 @@ def test_priority_scheduling_blast(
num_blocks
:
int
,
):
random
.
seed
(
42
)
np
.
random
.
seed
(
42
)
seen_request_prompt_length
=
dict
[
str
,
int
]()
seen_request_ids
=
set
[
str
]()
seen_mm_hashes
=
set
[
str
]()
...
...
tests/v1/core/test_scheduler.py
View file @
30b9c677
...
...
@@ -3,7 +3,6 @@
import
dataclasses
from
unittest.mock
import
Mock
import
numpy
as
np
import
pytest
import
torch
...
...
@@ -170,7 +169,7 @@ def test_schedule_partial_requests():
req_id_to_index
=
req_to_index
,
# Only the first request has a sampled token id because
# the remaining requests are still being prefilled.
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
,
np
.
array
(
[]
)
,
np
.
array
(
[]
)
],
sampled_token_ids
=
[[
0
],
[],
[]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -217,7 +216,7 @@ def test_no_mm_input_chunking():
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[]
)
for
_
in
range
(
len
(
requests
))],
sampled_token_ids
=
[[]
for
_
in
range
(
len
(
requests
))],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -277,7 +276,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[]
)
for
_
in
range
(
len
(
requests
))],
sampled_token_ids
=
[[]
for
_
in
range
(
len
(
requests
))],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -301,8 +300,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
([
0
]),
np
.
array
([
0
])]
+
[
np
.
array
([])
for
_
in
range
(
len
(
requests
)
-
2
)],
sampled_token_ids
=
[[
0
],
[
0
]]
+
[[]
for
_
in
range
(
len
(
requests
)
-
2
)],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -349,8 +347,8 @@ def test_stop_via_update_from_output():
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
EOS_TOKEN_ID
]
)
,
np
.
array
(
[
10
,
11
]
)
,
[
EOS_TOKEN_ID
],
[
10
,
11
],
],
# First request hits EOS, second continues
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -394,10 +392,7 @@ def test_stop_via_update_from_output():
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
([
10
,
42
,
12
]),
np
.
array
([
13
,
14
]),
],
# First request hits stop token
sampled_token_ids
=
[[
10
,
42
,
12
],
[
13
,
14
]],
# First request hits stop token
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -441,10 +436,7 @@ def test_stop_via_update_from_output():
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
([
10
,
11
,
12
]),
np
.
array
([
13
]),
],
# First request exceeds max_tokens
sampled_token_ids
=
[[
10
,
11
,
12
],
[
13
]],
# First request exceeds max_tokens
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -483,7 +475,7 @@ def test_stop_via_update_from_output():
model_output
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
0
].
request_id
],
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
EOS_TOKEN_ID
,
10
,
11
]
)
],
sampled_token_ids
=
[[
EOS_TOKEN_ID
,
10
,
11
]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -624,7 +616,7 @@ def test_schedule_concurrent_batches(
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
0
].
request_id
],
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -641,7 +633,7 @@ def test_schedule_concurrent_batches(
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
1
].
request_id
],
req_id_to_index
=
{
requests
[
1
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -678,7 +670,7 @@ def test_preempt_during_execution():
model_runner_output0
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
0
].
request_id
],
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -695,7 +687,7 @@ def test_preempt_during_execution():
model_runner_output1
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
1
].
request_id
],
req_id_to_index
=
{
requests
[
1
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
42
]
)
],
sampled_token_ids
=
[[
42
]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -712,18 +704,14 @@ def test_preempt_during_execution():
@
pytest
.
mark
.
parametrize
(
"spec_tokens,output_tokens,expected"
,
[
([[
1
,
2
,
3
]],
[
np
.
array
([
1
,
2
,
3
,
4
])],
(
1
,
3
,
3
,
[
1
,
1
,
1
])),
# perfect match
([[
1
,
2
,
3
]],
[
np
.
array
([
1
,
5
])],
(
1
,
3
,
1
,
[
1
,
0
,
0
])),
# early mismatch
(
[[
1
,
2
],
[
3
]],
[
np
.
array
([
1
,
2
,
5
]),
np
.
array
([
3
,
4
])],
(
2
,
3
,
3
,
[
2
,
1
]),
),
# multiple sequences
([[
1
]],
[
np
.
array
([
1
,
2
])],
(
1
,
1
,
1
,
[
1
])),
# single token sequence
([[]],
[
np
.
array
([
5
])],
(
0
,
0
,
0
,
[
0
])),
# empty sequence
([[
1
,
2
,
3
]],
[[
1
,
2
,
3
,
4
]],
(
1
,
3
,
3
,
[
1
,
1
,
1
])),
# perfect match
([[
1
,
2
,
3
]],
[[
1
,
5
]],
(
1
,
3
,
1
,
[
1
,
0
,
0
])),
# early mismatch
([[
1
,
2
],
[
3
]],
[[
1
,
2
,
5
],
[
3
,
4
]],
(
2
,
3
,
3
,
[
2
,
1
])),
# multiple sequences
([[
1
]],
[[
1
,
2
]],
(
1
,
1
,
1
,
[
1
])),
# single token sequence
([[]],
[[
5
]],
(
0
,
0
,
0
,
[
0
])),
# empty sequence
(
[[
1
,
2
,
3
],
[
4
,
5
,
6
]],
[
np
.
array
(
[
1
,
2
,
7
]
)
,
np
.
array
(
[
4
,
8
]
)
],
[[
1
,
2
,
7
],
[
4
,
8
]],
(
2
,
6
,
3
,
[
2
,
1
,
0
]),
),
# multiple mismatches
],
...
...
@@ -757,7 +745,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
for
_
in
range
(
len
(
requests
))],
sampled_token_ids
=
[[
0
]
for
_
in
range
(
len
(
requests
))],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -984,7 +972,7 @@ def test_kv_connector_basic(is_async: bool):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -1037,7 +1025,7 @@ def test_kv_connector_basic(is_async: bool):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -1100,7 +1088,7 @@ def test_external_prefix_cache_metrics():
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
[
r
.
request_id
for
r
in
requests
],
req_id_to_index
=
{
r
.
request_id
:
i
for
i
,
r
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
NUM_REQUESTS
,
sampled_token_ids
=
[[
1000
]]
*
NUM_REQUESTS
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -1166,7 +1154,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -1251,7 +1239,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -1344,7 +1332,7 @@ def make_output(scheduler: Scheduler):
return
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
scheduler
.
running
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
scheduler
.
running
)},
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
scheduler
.
running
),
sampled_token_ids
=
[[
1000
]]
*
len
(
scheduler
.
running
),
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -1761,7 +1749,7 @@ def test_priority_scheduling_preemption():
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
low_priority_requests
)
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
for
_
in
low_priority_requests
],
sampled_token_ids
=
[[
100
]
for
_
in
low_priority_requests
],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -1830,7 +1818,7 @@ def test_priority_scheduling_no_preemption_when_space_available():
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
low_priority_requests
)
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
for
_
in
low_priority_requests
],
sampled_token_ids
=
[[
100
]
for
_
in
low_priority_requests
],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -2076,7 +2064,7 @@ def test_priority_scheduling_heap_property():
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
req_id
],
req_id_to_index
=
{
req
.
req_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[
100
]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -2162,7 +2150,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
request_low
.
request_id
],
req_id_to_index
=
{
request_low
.
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[
100
]],
# spec_token_ids=None,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -2193,7 +2181,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
for
_
in
requests
],
sampled_token_ids
=
[[
100
]
for
_
in
requests
],
# spec_token_ids=None,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -2219,7 +2207,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[]
)
,
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[],
[
100
]],
# spec_token_ids=None,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -2636,7 +2624,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
model_output
=
ModelRunnerOutput
(
req_ids
=
[
request1
.
request_id
],
req_id_to_index
=
{
request1
.
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[
100
]],
# spec_token_ids=None,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -2842,7 +2830,7 @@ def test_ec_connector_unable_to_allocate(use_kv_connector):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
...
...
@@ -2955,7 +2943,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
request_low
.
request_id
],
req_id_to_index
=
{
request_low
.
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[
100
]],
# spec_token_ids=None,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -3006,7 +2994,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
for
_
in
requests
],
sampled_token_ids
=
[[
100
]
for
_
in
requests
],
# spec_token_ids=None,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -3041,7 +3029,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
,
np
.
array
(
[
100
,
200
]
)
],
sampled_token_ids
=
[[
100
],
[
100
,
200
]],
# spec_token_ids=None,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -3227,7 +3215,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
model_output
=
ModelRunnerOutput
(
req_ids
=
[
request1
.
request_id
,
request2
.
request_id
],
req_id_to_index
=
{
request1
.
request_id
:
0
,
request2
.
request_id
:
1
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
,
np
.
array
(
[
121
]
)
],
sampled_token_ids
=
[[
100
],
[
121
]],
# spec_token_ids=None,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
tests/v1/kv_connector/unit/test_nixl_connector.py
View file @
30b9c677
...
...
@@ -11,7 +11,6 @@ import uuid
from
collections
import
defaultdict
from
unittest.mock
import
patch
import
numpy
as
np
import
pytest
import
ray
import
torch
...
...
@@ -827,7 +826,7 @@ def test_kv_connector_stats_aggregation():
output
=
ModelRunnerOutput
(
req_ids
=
[
f
"req_
{
i
}
"
],
req_id_to_index
=
{
f
"req_
{
i
}
"
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
123
]
)
],
# dummy token
sampled_token_ids
=
[[
123
]],
# dummy token
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[
None
],
...
...
@@ -908,7 +907,7 @@ def test_multi_kv_connector_stats_aggregation():
output
=
ModelRunnerOutput
(
req_ids
=
[
f
"req_
{
i
}
"
],
req_id_to_index
=
{
f
"req_
{
i
}
"
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
123
]
)
],
sampled_token_ids
=
[[
123
]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[
None
],
...
...
@@ -966,7 +965,7 @@ def test_scheduler_kv_connector_stats_aggregation():
model_output
=
ModelRunnerOutput
(
req_ids
=
[
"req_0"
],
req_id_to_index
=
{
"req_0"
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
123
]
)
],
sampled_token_ids
=
[[
123
]],
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[
None
],
...
...
tests/v1/kv_connector/unit/utils.py
View file @
30b9c677
...
...
@@ -7,7 +7,6 @@ from dataclasses import dataclass
from
itertools
import
chain
,
count
from
typing
import
Any
import
numpy
as
np
import
torch
from
vllm
import
SamplingParams
...
...
@@ -229,7 +228,7 @@ def create_model_runner_output(
# Make sampled tokens.
sampled_token
=
EOS_TOKEN_ID
if
use_eos
else
token_id
sampled_token_ids
=
[
np
.
array
(
[
sampled_token
]
)
for
_
in
req_ids
]
sampled_token_ids
=
[[
sampled_token
]
for
_
in
req_ids
]
kv_connector_output
=
(
None
...
...
tests/v1/spec_decode/test_eagle.py
View file @
30b9c677
...
...
@@ -3,7 +3,6 @@
from
unittest
import
mock
import
numpy
as
np
import
pytest
import
torch
...
...
@@ -113,9 +112,7 @@ def test_prepare_next_token_ids():
sampled_token_ids_tensor
=
torch
.
tensor
(
sampled_token_ids
,
dtype
=
torch
.
int32
,
device
=
device
)
sampled_token_ids_cpu
=
[
np
.
array
([
i
for
i
in
seq
if
i
!=
-
1
])
for
seq
in
sampled_token_ids
]
sampled_token_ids_cpu
=
[[
i
for
i
in
seq
if
i
!=
-
1
]
for
seq
in
sampled_token_ids
]
expected_next_token_ids_cpu
=
[
1
,
4
,
30
,
40
]
expected_next_token_ids_tensor
=
torch
.
tensor
(
...
...
tests/v1/spec_decode/test_ngram.py
View file @
30b9c677
...
...
@@ -77,7 +77,7 @@ def test_ngram_proposer():
# No match.
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
5
]])
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
@@ -88,7 +88,7 @@ def test_ngram_proposer():
# No match for 4-gram.
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
1
,
2
,
3
]])
result
=
get_ngram_proposer
(
min_n
=
4
,
max_n
=
4
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
@@ -99,7 +99,7 @@ def test_ngram_proposer():
# No match for 4-gram but match for 3-gram.
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
1
,
2
,
3
]])
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
4
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
@@ -111,7 +111,7 @@ def test_ngram_proposer():
# In this case, the proposer should return the 4-gram match.
token_ids_cpu
=
np
.
array
([[
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
1
,
2
,
3
,
4
]])
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
4
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
@@ -122,7 +122,7 @@ def test_ngram_proposer():
# Match for 2-gram and 3-gram, but not 4-gram.
token_ids_cpu
=
np
.
array
([[
3
,
4
,
5
,
2
,
3
,
4
,
1
,
2
,
3
,
4
]])
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
4
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
@@ -133,7 +133,7 @@ def test_ngram_proposer():
# Multiple 3-gram matched, but always pick the first one.
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
100
,
1
,
2
,
3
,
200
,
1
,
2
,
3
,
300
,
1
,
2
,
3
]])
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
3
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
@@ -144,7 +144,7 @@ def test_ngram_proposer():
# check empty input
token_ids_cpu
=
np
.
array
([[]])
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
@@ -157,7 +157,7 @@ def test_ngram_proposer():
# second request has 3 tokens and no match. Padded with -1 for max len 5
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
1
,
2
],
[
4
,
5
,
6
,
-
1
,
-
1
]])
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
,
np
.
array
(
[
1
]
)
],
sampled_token_ids
=
[[
0
],
[
1
]],
req_ids
=
[
"0"
,
"1"
],
num_tokens_no_spec
=
np
.
array
([
5
,
3
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
@@ -181,7 +181,7 @@ def test_ngram_proposer():
input_2
[:
3
]
=
[
4
,
5
,
6
]
token_ids_cpu
=
np
.
array
([
input_1
,
input_2
])
result
=
ngram_proposer
.
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
,
np
.
array
(
[
1
]
)
],
sampled_token_ids
=
[[
0
],
[
1
]],
req_ids
=
[
"0"
,
"1"
],
num_tokens_no_spec
=
np
.
array
([
len
(
input_1
),
3
]),
token_ids_cpu
=
token_ids_cpu
,
...
...
vllm/utils/gc_utils.py
View file @
30b9c677
...
...
@@ -53,6 +53,7 @@ class GCDebugger:
self
.
config
=
config
# Start time in nanoseconds of this GC cycle
self
.
start_time_ns
:
int
=
time
.
monotonic_ns
()
self
.
num_objects
:
int
=
0
# If config.top_objects is positive,
# compute top collected objects by object types
self
.
gc_top_collected_objects
:
str
=
""
...
...
@@ -68,9 +69,10 @@ class GCDebugger:
# Before GC started, record GC start time
# and top collected objects
self
.
start_time_ns
=
time
.
monotonic_ns
()
if
(
top_objects
:
=
self
.
config
.
top_objects
)
>
0
:
objects
=
gc
.
get_objects
(
generation
)
self
.
num_objects
=
len
(
objects
)
self
.
gc_top_collected_objects
=
_compute_top_gc_collected_objects
(
gc
.
get_objects
(
generation
),
top_objects
objects
,
self
.
config
.
top_objects
)
elif
phase
==
"stop"
:
# After GC finished, Record GC elapsed time and
...
...
@@ -78,9 +80,10 @@ class GCDebugger:
elpased_ms
=
(
time
.
monotonic_ns
()
-
self
.
start_time_ns
)
/
1e6
logger
.
info
(
"GC took %.3fms to complete. "
"Collected %s objects in GC generation %d.%s"
,
"Collected %s objects
(out of %d)
in GC generation %d.%s"
,
elpased_ms
,
str
(
info
.
get
(
"collected"
,
"?"
)),
self
.
num_objects
,
generation
,
(
f
" Top collected objects:
\n
{
self
.
gc_top_collected_objects
}
"
...
...
vllm/v1/core/sched/scheduler.py
View file @
30b9c677
...
...
@@ -1013,8 +1013,8 @@ class Scheduler(SchedulerInterface):
continue
req_index
=
model_runner_output
.
req_id_to_index
[
req_id
]
generated_token_ids
:
list
[
int
]
=
(
sampled_token_ids
[
req_index
]
.
tolist
()
if
sampled_token_ids
else
[]
generated_token_ids
=
(
sampled_token_ids
[
req_index
]
if
sampled_token_ids
else
[]
)
scheduled_spec_token_ids
=
(
...
...
vllm/v1/outputs.py
View file @
30b9c677
...
...
@@ -158,7 +158,7 @@ class ModelRunnerOutput:
# num_generated_tokens is the number of tokens
# generated in the current step. It can be different for
# each request due to speculative/jump decoding.
sampled_token_ids
:
list
[
np
.
ndarray
]
sampled_token_ids
:
list
[
list
[
int
]
]
# [num_reqs, max_num_logprobs + 1]
# [num_reqs, max_num_logprobs + 1]
...
...
@@ -220,7 +220,7 @@ def make_empty_encoder_model_runner_output(
req_id_to_index
:
dict
[
str
,
int
]
=
{
rid
:
idx
for
idx
,
rid
in
enumerate
(
req_ids
)}
# No tokens generated yet ⇒ one placeholder entry ([0]) per request
sampled_token_ids
:
list
[
list
[
int
]]
=
[
np
.
array
(
[
0
]
)
for
_
in
req_ids
]
sampled_token_ids
:
list
[
list
[
int
]]
=
[[
0
]
for
_
in
req_ids
]
# Pooler outputs are not available yet ⇒ use None placeholders
pooler_output
:
list
[
torch
.
Tensor
|
None
]
=
[
None
for
_
in
req_ids
]
...
...
vllm/v1/sample/rejection_sampler.py
View file @
30b9c677
...
...
@@ -3,7 +3,6 @@
from
dataclasses
import
replace
import
numpy
as
np
import
torch
import
torch.nn
as
nn
...
...
@@ -205,7 +204,7 @@ class RejectionSampler(nn.Module):
def
parse_output
(
output_token_ids
:
torch
.
Tensor
,
vocab_size
:
int
,
)
->
list
[
np
.
ndarray
]:
)
->
list
[
list
[
int
]
]:
"""Parse the output of the rejection sampler.
Args:
output_token_ids: The sampled token IDs in shape
...
...
@@ -221,7 +220,10 @@ class RejectionSampler(nn.Module):
valid_mask
=
(
output_token_ids_np
!=
PLACEHOLDER_TOKEN_ID
)
&
(
output_token_ids_np
<
vocab_size
)
return
[
row
[
valid_mask
[
i
]]
for
i
,
row
in
enumerate
(
output_token_ids_np
)]
outputs
=
[
row
[
valid_mask
[
i
]].
tolist
()
for
i
,
row
in
enumerate
(
output_token_ids_np
)
]
return
outputs
def
apply_logits_processors
(
self
,
...
...
vllm/v1/spec_decode/eagle.py
View file @
30b9c677
...
...
@@ -496,7 +496,7 @@ class EagleProposer:
def
prepare_next_token_ids_cpu
(
self
,
sampled_token_ids
:
list
[
np
.
ndarray
],
sampled_token_ids
:
list
[
list
[
int
]
],
requests
:
dict
[
str
,
CachedRequestState
],
gpu_input_batch
:
InputBatch
,
num_scheduled_tokens
:
dict
[
str
,
int
],
...
...
@@ -511,7 +511,7 @@ class EagleProposer:
req_ids
=
gpu_input_batch
.
req_ids
next_token_ids
:
list
[
int
]
=
[]
for
i
,
token_ids
in
enumerate
(
sampled_token_ids
):
if
token_ids
.
shape
[
0
]
>
0
:
if
token_ids
:
# Common case.
next_token_id
=
token_ids
[
-
1
]
else
:
...
...
@@ -522,9 +522,10 @@ class EagleProposer:
seq_len
=
req_state
.
num_computed_tokens
+
num_scheduled_tokens
[
req_id
]
next_token_id
=
req_state
.
get_token_id
(
seq_len
)
next_token_ids
.
append
(
next_token_id
)
return
torch
.
tensor
(
next_token_ids
=
torch
.
tensor
(
next_token_ids
,
dtype
=
torch
.
int32
,
device
=
self
.
input_ids
.
device
)
return
next_token_ids
def
prepare_next_token_ids_padded
(
self
,
...
...
vllm/v1/spec_decode/ngram_proposer.py
View file @
30b9c677
...
...
@@ -54,7 +54,7 @@ class NgramProposer:
# Trigger Numba JIT compilation for N-gram proposer.
# This usually takes less than 1 second.
self
.
propose
(
[
np
.
array
(
[]
)
]
*
1024
,
[[]]
*
1024
,
[
""
]
*
1024
,
np
.
zeros
(
1024
,
dtype
=
np
.
int32
),
np
.
zeros
((
1024
,
self
.
max_model_len
),
dtype
=
np
.
int32
),
...
...
@@ -131,7 +131,7 @@ class NgramProposer:
def
propose
(
self
,
sampled_token_ids
:
list
[
np
.
ndarray
],
sampled_token_ids
:
list
[
list
[
int
]
],
req_ids
:
list
[
str
],
num_tokens_no_spec
:
np
.
ndarray
,
token_ids_cpu
:
np
.
ndarray
,
...
...
@@ -140,7 +140,7 @@ class NgramProposer:
# find which requests need ngram proposals
valid_ngram_requests
=
[]
for
i
,
sampled_ids
in
enumerate
(
sampled_token_ids
):
num_sampled_ids
=
sampled_ids
.
shape
[
0
]
num_sampled_ids
=
len
(
sampled_ids
)
if
not
num_sampled_ids
:
# Skip speculative decoding.
continue
...
...
vllm/v1/spec_decode/suffix_decoding.py
View file @
30b9c677
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
numpy
as
np
from
vllm.config
import
VllmConfig
from
vllm.v1.worker.gpu_input_batch
import
InputBatch
...
...
@@ -34,16 +32,16 @@ class SuffixDecodingProposer:
def
propose
(
self
,
input_batch
:
InputBatch
,
sampled_token_ids
:
list
[
np
.
ndarray
],
sampled_token_ids
:
list
[
list
[
int
]
],
)
->
list
[
list
[
int
]]:
"""
Propose speculative tokens for each request in the input batch. Suffix Decoding
will speculate a dynamic number of tokens for each request every decoding step,
so each entry in the returned list may have different lengths.
"""
draft_token_ids
:
list
[
np
.
ndarray
]
=
[]
draft_token_ids
:
list
[
list
[
int
]
]
=
[]
for
i
,
sampled_ids
in
enumerate
(
sampled_token_ids
):
if
sampled_ids
.
shape
[
0
]
==
0
:
if
not
sampled_ids
:
# Skip speculative decoding for partial prefills.
draft_token_ids
.
append
([])
continue
...
...
@@ -72,7 +70,7 @@ class SuffixDecodingProposer:
self
.
suffix_cache
.
start_request
(
req_id
,
prompt_token_ids
)
# Append the newly sampled ids to the suffix cache for this request.
self
.
suffix_cache
.
add_active_response
(
req_id
,
sampled_ids
.
tolist
()
)
self
.
suffix_cache
.
add_active_response
(
req_id
,
sampled_ids
)
# Suffix decoding only uses the most recent tokens up to max_tree_depth, so
# we extract the pattern from the end of the input.
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
30b9c677
...
...
@@ -221,16 +221,14 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
del
self
.
_sampled_token_ids
max_gen_len
=
self
.
sampled_token_ids_cpu
.
shape
[
-
1
]
if
max_gen_len
==
1
:
valid_sampled_token_ids
:
list
[
np
.
ndarray
]
=
[
row
for
row
in
self
.
sampled_token_ids_cpu
.
numpy
()
]
valid_sampled_token_ids
=
self
.
sampled_token_ids_cpu
.
tolist
()
else
:
valid_sampled_token_ids
=
RejectionSampler
.
parse_output
(
self
.
sampled_token_ids_cpu
,
self
.
vocab_size
,
)
for
i
in
self
.
_invalid_req_indices
:
valid_sampled_token_ids
[
i
]
=
np
.
array
([]
)
valid_sampled_token_ids
[
i
]
.
clear
(
)
output
=
self
.
_model_runner_output
output
.
sampled_token_ids
=
valid_sampled_token_ids
...
...
@@ -2466,7 +2464,7 @@ class GPUModelRunner(
)
->
tuple
[
dict
[
str
,
int
],
LogprobsLists
|
None
,
list
[
np
.
ndarray
],
list
[
list
[
int
]
],
dict
[
str
,
LogprobsTensors
|
None
],
list
[
str
],
dict
[
str
,
int
],
...
...
@@ -2492,7 +2490,6 @@ class GPUModelRunner(
num_sampled_tokens
=
sampler_output
.
sampled_token_ids
.
shape
[
0
]
sampled_token_ids
=
sampler_output
.
sampled_token_ids
invalid_req_indices
=
[]
valid_sampled_token_ids
:
list
[
np
.
ndarray
]
if
not
self
.
use_async_scheduling
:
# Get the valid generated tokens.
max_gen_len
=
sampled_token_ids
.
shape
[
-
1
]
...
...
@@ -2507,7 +2504,7 @@ class GPUModelRunner(
)
# Mask out the sampled tokens that should not be sampled.
for
i
in
discard_sampled_tokens_req_indices
:
valid_sampled_token_ids
[
int
(
i
)]
=
np
.
array
([]
)
valid_sampled_token_ids
[
int
(
i
)]
.
clear
(
)
else
:
valid_sampled_token_ids
=
[]
invalid_req_indices
=
discard_sampled_tokens_req_indices
.
tolist
()
...
...
@@ -2537,24 +2534,19 @@ class GPUModelRunner(
[
0
]
if
spec_decode_metadata
and
logprobs_tensors
else
None
)
for
req_idx
in
range
(
num_sampled_tokens
):
sampled_ids
:
np
.
ndarray
|
None
if
self
.
use_async_scheduling
:
sampled_ids
=
(
np
.
array
([
-
1
])
if
req_idx
not
in
invalid_req_indices_set
else
None
)
sampled_ids
=
[
-
1
]
if
req_idx
not
in
invalid_req_indices_set
else
None
else
:
sampled_ids
=
valid_sampled_token_ids
[
req_idx
]
num_sampled_ids
:
int
=
(
sampled_ids
.
shape
[
0
]
if
sampled_ids
is
not
None
else
0
)
num_sampled_ids
:
int
=
len
(
sampled_ids
)
if
sampled_ids
else
0
if
cu_num_accepted_tokens
is
not
None
:
cu_num_accepted_tokens
.
append
(
cu_num_accepted_tokens
[
-
1
]
+
num_sampled_ids
)
if
sampled_ids
is
None
or
num_
sampled_ids
==
0
:
if
not
sampled_ids
:
continue
start_idx
=
self
.
input_batch
.
num_tokens_no_spec
[
req_idx
]
...
...
@@ -2938,9 +2930,7 @@ class GPUModelRunner(
self
.
input_batch
.
prev_sampled_token_ids
=
None
def
propose_draft_token_ids
(
sampled_token_ids
:
torch
.
Tensor
|
list
[
np
.
ndarray
],
)
->
None
:
def
propose_draft_token_ids
(
sampled_token_ids
):
assert
spec_decode_common_attn_metadata
is
not
None
with
record_function_or_nullcontext
(
"gpu_model_runner: draft"
):
self
.
_draft_token_ids
=
self
.
propose_draft_token_ids
(
...
...
@@ -3113,14 +3103,14 @@ class GPUModelRunner(
def
propose_draft_token_ids
(
self
,
scheduler_output
:
"SchedulerOutput"
,
sampled_token_ids
:
torch
.
Tensor
|
list
[
np
.
ndarray
],
sampled_token_ids
:
torch
.
Tensor
|
list
[
list
[
int
]
],
sampling_metadata
:
SamplingMetadata
,
hidden_states
:
torch
.
Tensor
,
sample_hidden_states
:
torch
.
Tensor
,
aux_hidden_states
:
list
[
torch
.
Tensor
]
|
None
,
spec_decode_metadata
:
SpecDecodeMetadata
|
None
,
common_attn_metadata
:
CommonAttentionMetadata
,
)
->
torch
.
Tensor
|
list
[
list
[
int
]]:
)
->
list
[
list
[
int
]]
|
torch
.
Tensor
:
num_scheduled_tokens
=
scheduler_output
.
total_num_scheduled_tokens
spec_config
=
self
.
speculative_config
assert
spec_config
is
not
None
...
...
@@ -3154,7 +3144,7 @@ class GPUModelRunner(
for
num_draft
,
tokens
in
zip
(
spec_decode_metadata
.
num_draft_tokens
,
sampled_token_ids
):
indices
.
append
(
offset
+
tokens
.
shape
[
0
]
-
1
)
indices
.
append
(
offset
+
len
(
tokens
)
-
1
)
offset
+=
num_draft
+
1
indices
=
torch
.
tensor
(
indices
,
device
=
self
.
device
)
hidden_states
=
sample_hidden_states
[
indices
]
...
...
@@ -5150,7 +5140,7 @@ class GPUModelRunner(
return
kv_cache_spec
def
_to_list
(
self
,
sampled_token_ids
:
torch
.
Tensor
)
->
list
[
np
.
ndarray
]:
def
_to_list
(
self
,
sampled_token_ids
:
torch
.
Tensor
)
->
list
[
list
[
int
]
]:
# This is a short term mitigation for issue mentioned in
# https://github.com/vllm-project/vllm/issues/22754.
# `tolist` would trigger a cuda wise stream sync, which
...
...
@@ -5163,4 +5153,4 @@ class GPUModelRunner(
pinned
.
copy_
(
sampled_token_ids
,
non_blocking
=
True
)
self
.
transfer_event
.
record
()
self
.
transfer_event
.
synchronize
()
return
[
row
for
row
in
pinned
.
numpy
()
]
return
pinned
.
tolist
()
vllm/v1/worker/tpu_model_runner.py
View file @
30b9c677
...
...
@@ -1262,15 +1262,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
max_gen_len
=
selected_token_ids
.
shape
[
-
1
]
if
max_gen_len
==
1
:
valid_sampled_token_ids
:
list
[
np
.
ndarray
]
=
[
row
for
row
in
selected_token_ids
.
numpy
()
]
valid_sampled_token_ids
=
selected_token_ids
.
tolist
()
# Mask out the sampled tokens that should not be sampled.
# TODO: Keep in sync with gpu_model_runner.py, in particular
# the "else" case here
for
i
in
discard_sampled_tokens_req_indices
:
valid_sampled_token_ids
[
i
]
=
np
.
array
([]
)
valid_sampled_token_ids
[
i
]
.
clear
(
)
# Append sampled tokens
for
i
,
req_state
,
seq_len
in
request_seq_lens
:
...
...
@@ -1283,7 +1281,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
valid_mask
=
selected_token_ids
!=
INVALID_TOKEN_ID
gen_lens
=
valid_mask
.
sum
(
dim
=
1
).
tolist
()
valid_sampled_token_ids
=
[
seq
.
numpy
()
for
seq
in
selected_token_ids
[
valid_mask
].
split
(
gen_lens
)
seq
.
tolist
()
for
seq
in
selected_token_ids
[
valid_mask
].
split
(
gen_lens
)
]
self
.
input_batch
.
num_tokens
[:
num_reqs
]
+=
gen_lens
for
i
,
req_state
,
seq_len
in
request_seq_lens
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment