Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
30b9c677
Unverified
Commit
30b9c677
authored
Nov 20, 2025
by
Jialin Ouyang
Committed by
GitHub
Nov 20, 2025
Browse files
Revert "[Redo] #26368 (#28771)" (#29121)
Signed-off-by:
Jialin Ouyang
<
Jialin.Ouyang@gmail.com
>
parent
11857a00
Changes
16
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
99 additions
and
127 deletions
+99
-127
tests/v1/core/test_async_scheduler.py
tests/v1/core/test_async_scheduler.py
+1
-2
tests/v1/core/test_priority_scheduler_random.py
tests/v1/core/test_priority_scheduler_random.py
+2
-4
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+38
-50
tests/v1/kv_connector/unit/test_nixl_connector.py
tests/v1/kv_connector/unit/test_nixl_connector.py
+3
-4
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/utils.py
+1
-2
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+1
-4
tests/v1/spec_decode/test_ngram.py
tests/v1/spec_decode/test_ngram.py
+9
-9
vllm/utils/gc_utils.py
vllm/utils/gc_utils.py
+8
-5
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+2
-2
vllm/v1/outputs.py
vllm/v1/outputs.py
+2
-2
vllm/v1/sample/rejection_sampler.py
vllm/v1/sample/rejection_sampler.py
+5
-3
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+4
-3
vllm/v1/spec_decode/ngram_proposer.py
vllm/v1/spec_decode/ngram_proposer.py
+3
-3
vllm/v1/spec_decode/suffix_decoding.py
vllm/v1/spec_decode/suffix_decoding.py
+4
-6
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+13
-23
vllm/v1/worker/tpu_model_runner.py
vllm/v1/worker/tpu_model_runner.py
+3
-5
No files found.
tests/v1/core/test_async_scheduler.py
View file @
30b9c677
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections
import
deque
from
collections
import
deque
import
numpy
as
np
import
pytest
import
pytest
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.core.sched.output
import
SchedulerOutput
...
@@ -22,7 +21,7 @@ def _make_model_runner_output(
...
@@ -22,7 +21,7 @@ def _make_model_runner_output(
return
ModelRunnerOutput
(
return
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_ids
=
req_ids
,
req_id_to_index
=
{
req_id
:
i
for
i
,
req_id
in
enumerate
(
req_ids
)},
req_id_to_index
=
{
req_id
:
i
for
i
,
req_id
in
enumerate
(
req_ids
)},
sampled_token_ids
=
[
np
.
array
(
[
i
]
)
for
i
in
range
(
len
(
req_ids
))],
sampled_token_ids
=
[[
i
]
for
i
in
range
(
len
(
req_ids
))],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
...
tests/v1/core/test_priority_scheduler_random.py
View file @
30b9c677
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
import
random
import
random
import
uuid
import
uuid
import
numpy
as
np
import
pytest
import
pytest
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
...
@@ -100,7 +99,8 @@ def _mock_execute_model(
...
@@ -100,7 +99,8 @@ def _mock_execute_model(
random
.
randint
(
*
num_output_tokens_range
)
for
_
in
range
(
len
(
request_ids
))
random
.
randint
(
*
num_output_tokens_range
)
for
_
in
range
(
len
(
request_ids
))
]
]
sampled_token_ids
=
[
sampled_token_ids
=
[
np
.
random
.
randint
(
0
,
100
,
size
=
num_tokens
)
for
num_tokens
in
num_output_tokens
[
random
.
randint
(
0
,
100
)
for
_
in
range
(
num_tokens
)]
for
num_tokens
in
num_output_tokens
]
]
return
ModelRunnerOutput
(
return
ModelRunnerOutput
(
...
@@ -196,8 +196,6 @@ def test_priority_scheduling_blast(
...
@@ -196,8 +196,6 @@ def test_priority_scheduling_blast(
num_blocks
:
int
,
num_blocks
:
int
,
):
):
random
.
seed
(
42
)
random
.
seed
(
42
)
np
.
random
.
seed
(
42
)
seen_request_prompt_length
=
dict
[
str
,
int
]()
seen_request_prompt_length
=
dict
[
str
,
int
]()
seen_request_ids
=
set
[
str
]()
seen_request_ids
=
set
[
str
]()
seen_mm_hashes
=
set
[
str
]()
seen_mm_hashes
=
set
[
str
]()
...
...
tests/v1/core/test_scheduler.py
View file @
30b9c677
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
import
dataclasses
import
dataclasses
from
unittest.mock
import
Mock
from
unittest.mock
import
Mock
import
numpy
as
np
import
pytest
import
pytest
import
torch
import
torch
...
@@ -170,7 +169,7 @@ def test_schedule_partial_requests():
...
@@ -170,7 +169,7 @@ def test_schedule_partial_requests():
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
# Only the first request has a sampled token id because
# Only the first request has a sampled token id because
# the rest requests are still being prefilled.
# the rest requests are still being prefilled.
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
,
np
.
array
(
[]
)
,
np
.
array
(
[]
)
],
sampled_token_ids
=
[[
0
],
[],
[]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -217,7 +216,7 @@ def test_no_mm_input_chunking():
...
@@ -217,7 +216,7 @@ def test_no_mm_input_chunking():
model_runner_output
=
ModelRunnerOutput
(
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[]
)
for
_
in
range
(
len
(
requests
))],
sampled_token_ids
=
[[]
for
_
in
range
(
len
(
requests
))],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -277,7 +276,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
...
@@ -277,7 +276,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
model_runner_output
=
ModelRunnerOutput
(
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[]
)
for
_
in
range
(
len
(
requests
))],
sampled_token_ids
=
[[]
for
_
in
range
(
len
(
requests
))],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -301,8 +300,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
...
@@ -301,8 +300,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
model_runner_output
=
ModelRunnerOutput
(
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
([
0
]),
np
.
array
([
0
])]
sampled_token_ids
=
[[
0
],
[
0
]]
+
[[]
for
_
in
range
(
len
(
requests
)
-
2
)],
+
[
np
.
array
([])
for
_
in
range
(
len
(
requests
)
-
2
)],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -349,8 +347,8 @@ def test_stop_via_update_from_output():
...
@@ -349,8 +347,8 @@ def test_stop_via_update_from_output():
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
sampled_token_ids
=
[
np
.
array
(
[
EOS_TOKEN_ID
]
)
,
[
EOS_TOKEN_ID
],
np
.
array
(
[
10
,
11
]
)
,
[
10
,
11
],
],
# First request hits EOS, second continues
],
# First request hits EOS, second continues
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
@@ -394,10 +392,7 @@ def test_stop_via_update_from_output():
...
@@ -394,10 +392,7 @@ def test_stop_via_update_from_output():
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
sampled_token_ids
=
[[
10
,
42
,
12
],
[
13
,
14
]],
# First request hits stop token
np
.
array
([
10
,
42
,
12
]),
np
.
array
([
13
,
14
]),
],
# First request hits stop token
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -441,10 +436,7 @@ def test_stop_via_update_from_output():
...
@@ -441,10 +436,7 @@ def test_stop_via_update_from_output():
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
sampled_token_ids
=
[[
10
,
11
,
12
],
[
13
]],
# First request exceeds max_tokens
np
.
array
([
10
,
11
,
12
]),
np
.
array
([
13
]),
],
# First request exceeds max_tokens
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -483,7 +475,7 @@ def test_stop_via_update_from_output():
...
@@ -483,7 +475,7 @@ def test_stop_via_update_from_output():
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
0
].
request_id
],
req_ids
=
[
requests
[
0
].
request_id
],
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
EOS_TOKEN_ID
,
10
,
11
]
)
],
sampled_token_ids
=
[[
EOS_TOKEN_ID
,
10
,
11
]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -624,7 +616,7 @@ def test_schedule_concurrent_batches(
...
@@ -624,7 +616,7 @@ def test_schedule_concurrent_batches(
model_runner_output
=
ModelRunnerOutput
(
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
0
].
request_id
],
req_ids
=
[
requests
[
0
].
request_id
],
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -641,7 +633,7 @@ def test_schedule_concurrent_batches(
...
@@ -641,7 +633,7 @@ def test_schedule_concurrent_batches(
model_runner_output
=
ModelRunnerOutput
(
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
1
].
request_id
],
req_ids
=
[
requests
[
1
].
request_id
],
req_id_to_index
=
{
requests
[
1
].
request_id
:
0
},
req_id_to_index
=
{
requests
[
1
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -678,7 +670,7 @@ def test_preempt_during_execution():
...
@@ -678,7 +670,7 @@ def test_preempt_during_execution():
model_runner_output0
=
ModelRunnerOutput
(
model_runner_output0
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
0
].
request_id
],
req_ids
=
[
requests
[
0
].
request_id
],
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -695,7 +687,7 @@ def test_preempt_during_execution():
...
@@ -695,7 +687,7 @@ def test_preempt_during_execution():
model_runner_output1
=
ModelRunnerOutput
(
model_runner_output1
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
1
].
request_id
],
req_ids
=
[
requests
[
1
].
request_id
],
req_id_to_index
=
{
requests
[
1
].
request_id
:
0
},
req_id_to_index
=
{
requests
[
1
].
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
42
]
)
],
sampled_token_ids
=
[[
42
]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -712,18 +704,14 @@ def test_preempt_during_execution():
...
@@ -712,18 +704,14 @@ def test_preempt_during_execution():
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"spec_tokens,output_tokens,expected"
,
"spec_tokens,output_tokens,expected"
,
[
[
([[
1
,
2
,
3
]],
[
np
.
array
([
1
,
2
,
3
,
4
])],
(
1
,
3
,
3
,
[
1
,
1
,
1
])),
# perfect match
([[
1
,
2
,
3
]],
[[
1
,
2
,
3
,
4
]],
(
1
,
3
,
3
,
[
1
,
1
,
1
])),
# perfect match
([[
1
,
2
,
3
]],
[
np
.
array
([
1
,
5
])],
(
1
,
3
,
1
,
[
1
,
0
,
0
])),
# early mismatch
([[
1
,
2
,
3
]],
[[
1
,
5
]],
(
1
,
3
,
1
,
[
1
,
0
,
0
])),
# early mismatch
(
([[
1
,
2
],
[
3
]],
[[
1
,
2
,
5
],
[
3
,
4
]],
(
2
,
3
,
3
,
[
2
,
1
])),
# multiple sequences
[[
1
,
2
],
[
3
]],
([[
1
]],
[[
1
,
2
]],
(
1
,
1
,
1
,
[
1
])),
# single token sequence
[
np
.
array
([
1
,
2
,
5
]),
np
.
array
([
3
,
4
])],
([[]],
[[
5
]],
(
0
,
0
,
0
,
[
0
])),
# empty sequence
(
2
,
3
,
3
,
[
2
,
1
]),
),
# multiple sequences
([[
1
]],
[
np
.
array
([
1
,
2
])],
(
1
,
1
,
1
,
[
1
])),
# single token sequence
([[]],
[
np
.
array
([
5
])],
(
0
,
0
,
0
,
[
0
])),
# empty sequence
(
(
[[
1
,
2
,
3
],
[
4
,
5
,
6
]],
[[
1
,
2
,
3
],
[
4
,
5
,
6
]],
[
np
.
array
(
[
1
,
2
,
7
]
)
,
np
.
array
(
[
4
,
8
]
)
],
[[
1
,
2
,
7
],
[
4
,
8
]],
(
2
,
6
,
3
,
[
2
,
1
,
0
]),
(
2
,
6
,
3
,
[
2
,
1
,
0
]),
),
# multiple mismatches
),
# multiple mismatches
],
],
...
@@ -757,7 +745,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
...
@@ -757,7 +745,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
model_runner_output
=
ModelRunnerOutput
(
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
for
_
in
range
(
len
(
requests
))],
sampled_token_ids
=
[[
0
]
for
_
in
range
(
len
(
requests
))],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -984,7 +972,7 @@ def test_kv_connector_basic(is_async: bool):
...
@@ -984,7 +972,7 @@ def test_kv_connector_basic(is_async: bool):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -1037,7 +1025,7 @@ def test_kv_connector_basic(is_async: bool):
...
@@ -1037,7 +1025,7 @@ def test_kv_connector_basic(is_async: bool):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -1100,7 +1088,7 @@ def test_external_prefix_cache_metrics():
...
@@ -1100,7 +1088,7 @@ def test_external_prefix_cache_metrics():
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
[
r
.
request_id
for
r
in
requests
],
req_ids
=
[
r
.
request_id
for
r
in
requests
],
req_id_to_index
=
{
r
.
request_id
:
i
for
i
,
r
in
enumerate
(
requests
)},
req_id_to_index
=
{
r
.
request_id
:
i
for
i
,
r
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
NUM_REQUESTS
,
sampled_token_ids
=
[[
1000
]]
*
NUM_REQUESTS
,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -1166,7 +1154,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
...
@@ -1166,7 +1154,7 @@ def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -1251,7 +1239,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
...
@@ -1251,7 +1239,7 @@ def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -1344,7 +1332,7 @@ def make_output(scheduler: Scheduler):
...
@@ -1344,7 +1332,7 @@ def make_output(scheduler: Scheduler):
return
ModelRunnerOutput
(
return
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
scheduler
.
running
],
req_ids
=
[
req
.
request_id
for
req
in
scheduler
.
running
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
scheduler
.
running
)},
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
scheduler
.
running
)},
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
scheduler
.
running
),
sampled_token_ids
=
[[
1000
]]
*
len
(
scheduler
.
running
),
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -1761,7 +1749,7 @@ def test_priority_scheduling_preemption():
...
@@ -1761,7 +1749,7 @@ def test_priority_scheduling_preemption():
req_id_to_index
=
{
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
low_priority_requests
)
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
low_priority_requests
)
},
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
for
_
in
low_priority_requests
],
sampled_token_ids
=
[[
100
]
for
_
in
low_priority_requests
],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -1830,7 +1818,7 @@ def test_priority_scheduling_no_preemption_when_space_available():
...
@@ -1830,7 +1818,7 @@ def test_priority_scheduling_no_preemption_when_space_available():
req_id_to_index
=
{
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
low_priority_requests
)
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
low_priority_requests
)
},
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
for
_
in
low_priority_requests
],
sampled_token_ids
=
[[
100
]
for
_
in
low_priority_requests
],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -2076,7 +2064,7 @@ def test_priority_scheduling_heap_property():
...
@@ -2076,7 +2064,7 @@ def test_priority_scheduling_heap_property():
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
req_id
],
req_ids
=
[
req
.
req_id
],
req_id_to_index
=
{
req
.
req_id
:
0
},
req_id_to_index
=
{
req
.
req_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[
100
]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -2162,7 +2150,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
...
@@ -2162,7 +2150,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
request_low
.
request_id
],
req_ids
=
[
request_low
.
request_id
],
req_id_to_index
=
{
request_low
.
request_id
:
0
},
req_id_to_index
=
{
request_low
.
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[
100
]],
# spec_token_ids=None,
# spec_token_ids=None,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
@@ -2193,7 +2181,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
...
@@ -2193,7 +2181,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
for
_
in
requests
],
sampled_token_ids
=
[[
100
]
for
_
in
requests
],
# spec_token_ids=None,
# spec_token_ids=None,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
@@ -2219,7 +2207,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
...
@@ -2219,7 +2207,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[]
)
,
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[],
[
100
]],
# spec_token_ids=None,
# spec_token_ids=None,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
@@ -2636,7 +2624,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
...
@@ -2636,7 +2624,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
request1
.
request_id
],
req_ids
=
[
request1
.
request_id
],
req_id_to_index
=
{
request1
.
request_id
:
0
},
req_id_to_index
=
{
request1
.
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[
100
]],
# spec_token_ids=None,
# spec_token_ids=None,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
@@ -2842,7 +2830,7 @@ def test_ec_connector_unable_to_allocate(use_kv_connector):
...
@@ -2842,7 +2830,7 @@ def test_ec_connector_unable_to_allocate(use_kv_connector):
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[
np
.
array
(
[
1000
]
)
]
*
len
(
req_ids
),
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[],
pooler_output
=
[],
...
@@ -2955,7 +2943,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
...
@@ -2955,7 +2943,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
request_low
.
request_id
],
req_ids
=
[
request_low
.
request_id
],
req_id_to_index
=
{
request_low
.
request_id
:
0
},
req_id_to_index
=
{
request_low
.
request_id
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
],
sampled_token_ids
=
[[
100
]],
# spec_token_ids=None,
# spec_token_ids=None,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
@@ -3006,7 +2994,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
...
@@ -3006,7 +2994,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
for
_
in
requests
],
sampled_token_ids
=
[[
100
]
for
_
in
requests
],
# spec_token_ids=None,
# spec_token_ids=None,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
@@ -3041,7 +3029,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
...
@@ -3041,7 +3029,7 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
,
np
.
array
(
[
100
,
200
]
)
],
sampled_token_ids
=
[[
100
],
[
100
,
200
]],
# spec_token_ids=None,
# spec_token_ids=None,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
@@ -3227,7 +3215,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
...
@@ -3227,7 +3215,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
request1
.
request_id
,
request2
.
request_id
],
req_ids
=
[
request1
.
request_id
,
request2
.
request_id
],
req_id_to_index
=
{
request1
.
request_id
:
0
,
request2
.
request_id
:
1
},
req_id_to_index
=
{
request1
.
request_id
:
0
,
request2
.
request_id
:
1
},
sampled_token_ids
=
[
np
.
array
(
[
100
]
)
,
np
.
array
(
[
121
]
)
],
sampled_token_ids
=
[[
100
],
[
121
]],
# spec_token_ids=None,
# spec_token_ids=None,
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
...
...
tests/v1/kv_connector/unit/test_nixl_connector.py
View file @
30b9c677
...
@@ -11,7 +11,6 @@ import uuid
...
@@ -11,7 +11,6 @@ import uuid
from
collections
import
defaultdict
from
collections
import
defaultdict
from
unittest.mock
import
patch
from
unittest.mock
import
patch
import
numpy
as
np
import
pytest
import
pytest
import
ray
import
ray
import
torch
import
torch
...
@@ -827,7 +826,7 @@ def test_kv_connector_stats_aggregation():
...
@@ -827,7 +826,7 @@ def test_kv_connector_stats_aggregation():
output
=
ModelRunnerOutput
(
output
=
ModelRunnerOutput
(
req_ids
=
[
f
"req_
{
i
}
"
],
req_ids
=
[
f
"req_
{
i
}
"
],
req_id_to_index
=
{
f
"req_
{
i
}
"
:
0
},
req_id_to_index
=
{
f
"req_
{
i
}
"
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
123
]
)
],
# dummy token
sampled_token_ids
=
[[
123
]],
# dummy token
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[
None
],
pooler_output
=
[
None
],
...
@@ -908,7 +907,7 @@ def test_multi_kv_connector_stats_aggregation():
...
@@ -908,7 +907,7 @@ def test_multi_kv_connector_stats_aggregation():
output
=
ModelRunnerOutput
(
output
=
ModelRunnerOutput
(
req_ids
=
[
f
"req_
{
i
}
"
],
req_ids
=
[
f
"req_
{
i
}
"
],
req_id_to_index
=
{
f
"req_
{
i
}
"
:
0
},
req_id_to_index
=
{
f
"req_
{
i
}
"
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
123
]
)
],
sampled_token_ids
=
[[
123
]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[
None
],
pooler_output
=
[
None
],
...
@@ -966,7 +965,7 @@ def test_scheduler_kv_connector_stats_aggregation():
...
@@ -966,7 +965,7 @@ def test_scheduler_kv_connector_stats_aggregation():
model_output
=
ModelRunnerOutput
(
model_output
=
ModelRunnerOutput
(
req_ids
=
[
"req_0"
],
req_ids
=
[
"req_0"
],
req_id_to_index
=
{
"req_0"
:
0
},
req_id_to_index
=
{
"req_0"
:
0
},
sampled_token_ids
=
[
np
.
array
(
[
123
]
)
],
sampled_token_ids
=
[[
123
]],
logprobs
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
prompt_logprobs_dict
=
{},
pooler_output
=
[
None
],
pooler_output
=
[
None
],
...
...
tests/v1/kv_connector/unit/utils.py
View file @
30b9c677
...
@@ -7,7 +7,6 @@ from dataclasses import dataclass
...
@@ -7,7 +7,6 @@ from dataclasses import dataclass
from
itertools
import
chain
,
count
from
itertools
import
chain
,
count
from
typing
import
Any
from
typing
import
Any
import
numpy
as
np
import
torch
import
torch
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
...
@@ -229,7 +228,7 @@ def create_model_runner_output(
...
@@ -229,7 +228,7 @@ def create_model_runner_output(
# Make sampled tokens.
# Make sampled tokens.
sampled_token
=
EOS_TOKEN_ID
if
use_eos
else
token_id
sampled_token
=
EOS_TOKEN_ID
if
use_eos
else
token_id
sampled_token_ids
=
[
np
.
array
(
[
sampled_token
]
)
for
_
in
req_ids
]
sampled_token_ids
=
[[
sampled_token
]
for
_
in
req_ids
]
kv_connector_output
=
(
kv_connector_output
=
(
None
None
...
...
tests/v1/spec_decode/test_eagle.py
View file @
30b9c677
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
from
unittest
import
mock
from
unittest
import
mock
import
numpy
as
np
import
pytest
import
pytest
import
torch
import
torch
...
@@ -113,9 +112,7 @@ def test_prepare_next_token_ids():
...
@@ -113,9 +112,7 @@ def test_prepare_next_token_ids():
sampled_token_ids_tensor
=
torch
.
tensor
(
sampled_token_ids_tensor
=
torch
.
tensor
(
sampled_token_ids
,
dtype
=
torch
.
int32
,
device
=
device
sampled_token_ids
,
dtype
=
torch
.
int32
,
device
=
device
)
)
sampled_token_ids_cpu
=
[
sampled_token_ids_cpu
=
[[
i
for
i
in
seq
if
i
!=
-
1
]
for
seq
in
sampled_token_ids
]
np
.
array
([
i
for
i
in
seq
if
i
!=
-
1
])
for
seq
in
sampled_token_ids
]
expected_next_token_ids_cpu
=
[
1
,
4
,
30
,
40
]
expected_next_token_ids_cpu
=
[
1
,
4
,
30
,
40
]
expected_next_token_ids_tensor
=
torch
.
tensor
(
expected_next_token_ids_tensor
=
torch
.
tensor
(
...
...
tests/v1/spec_decode/test_ngram.py
View file @
30b9c677
...
@@ -77,7 +77,7 @@ def test_ngram_proposer():
...
@@ -77,7 +77,7 @@ def test_ngram_proposer():
# No match.
# No match.
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
5
]])
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
5
]])
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
@@ -88,7 +88,7 @@ def test_ngram_proposer():
...
@@ -88,7 +88,7 @@ def test_ngram_proposer():
# No match for 4-gram.
# No match for 4-gram.
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
1
,
2
,
3
]])
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
1
,
2
,
3
]])
result
=
get_ngram_proposer
(
min_n
=
4
,
max_n
=
4
,
k
=
2
).
propose
(
result
=
get_ngram_proposer
(
min_n
=
4
,
max_n
=
4
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
@@ -99,7 +99,7 @@ def test_ngram_proposer():
...
@@ -99,7 +99,7 @@ def test_ngram_proposer():
# No match for 4-gram but match for 3-gram.
# No match for 4-gram but match for 3-gram.
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
1
,
2
,
3
]])
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
4
,
1
,
2
,
3
]])
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
4
,
k
=
2
).
propose
(
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
4
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
@@ -111,7 +111,7 @@ def test_ngram_proposer():
...
@@ -111,7 +111,7 @@ def test_ngram_proposer():
# In this case, the proposer should return the 4-gram match.
# In this case, the proposer should return the 4-gram match.
token_ids_cpu
=
np
.
array
([[
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
1
,
2
,
3
,
4
]])
token_ids_cpu
=
np
.
array
([[
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
1
,
2
,
3
,
4
]])
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
4
,
k
=
2
).
propose
(
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
4
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
@@ -122,7 +122,7 @@ def test_ngram_proposer():
...
@@ -122,7 +122,7 @@ def test_ngram_proposer():
# Match for 2-gram and 3-gram, but not 4-gram.
# Match for 2-gram and 3-gram, but not 4-gram.
token_ids_cpu
=
np
.
array
([[
3
,
4
,
5
,
2
,
3
,
4
,
1
,
2
,
3
,
4
]])
token_ids_cpu
=
np
.
array
([[
3
,
4
,
5
,
2
,
3
,
4
,
1
,
2
,
3
,
4
]])
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
4
,
k
=
2
).
propose
(
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
4
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
@@ -133,7 +133,7 @@ def test_ngram_proposer():
...
@@ -133,7 +133,7 @@ def test_ngram_proposer():
# Multiple 3-gram matched, but always pick the first one.
# Multiple 3-gram matched, but always pick the first one.
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
100
,
1
,
2
,
3
,
200
,
1
,
2
,
3
,
300
,
1
,
2
,
3
]])
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
100
,
1
,
2
,
3
,
200
,
1
,
2
,
3
,
300
,
1
,
2
,
3
]])
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
3
,
k
=
2
).
propose
(
result
=
get_ngram_proposer
(
min_n
=
3
,
max_n
=
3
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
@@ -144,7 +144,7 @@ def test_ngram_proposer():
...
@@ -144,7 +144,7 @@ def test_ngram_proposer():
# check empty input
# check empty input
token_ids_cpu
=
np
.
array
([[]])
token_ids_cpu
=
np
.
array
([[]])
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
],
sampled_token_ids
=
[[
0
]],
req_ids
=
[
"0"
],
req_ids
=
[
"0"
],
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
num_tokens_no_spec
=
np
.
array
([
len
(
c
)
for
c
in
token_ids_cpu
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
@@ -157,7 +157,7 @@ def test_ngram_proposer():
...
@@ -157,7 +157,7 @@ def test_ngram_proposer():
# second request has 3 tokens and no match. Padded with -1 for max len 5
# second request has 3 tokens and no match. Padded with -1 for max len 5
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
1
,
2
],
[
4
,
5
,
6
,
-
1
,
-
1
]])
token_ids_cpu
=
np
.
array
([[
1
,
2
,
3
,
1
,
2
],
[
4
,
5
,
6
,
-
1
,
-
1
]])
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
result
=
get_ngram_proposer
(
min_n
=
2
,
max_n
=
2
,
k
=
2
).
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
,
np
.
array
(
[
1
]
)
],
sampled_token_ids
=
[[
0
],
[
1
]],
req_ids
=
[
"0"
,
"1"
],
req_ids
=
[
"0"
,
"1"
],
num_tokens_no_spec
=
np
.
array
([
5
,
3
]),
num_tokens_no_spec
=
np
.
array
([
5
,
3
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
@@ -181,7 +181,7 @@ def test_ngram_proposer():
...
@@ -181,7 +181,7 @@ def test_ngram_proposer():
input_2
[:
3
]
=
[
4
,
5
,
6
]
input_2
[:
3
]
=
[
4
,
5
,
6
]
token_ids_cpu
=
np
.
array
([
input_1
,
input_2
])
token_ids_cpu
=
np
.
array
([
input_1
,
input_2
])
result
=
ngram_proposer
.
propose
(
result
=
ngram_proposer
.
propose
(
sampled_token_ids
=
[
np
.
array
(
[
0
]
)
,
np
.
array
(
[
1
]
)
],
sampled_token_ids
=
[[
0
],
[
1
]],
req_ids
=
[
"0"
,
"1"
],
req_ids
=
[
"0"
,
"1"
],
num_tokens_no_spec
=
np
.
array
([
len
(
input_1
),
3
]),
num_tokens_no_spec
=
np
.
array
([
len
(
input_1
),
3
]),
token_ids_cpu
=
token_ids_cpu
,
token_ids_cpu
=
token_ids_cpu
,
...
...
vllm/utils/gc_utils.py
View file @
30b9c677
...
@@ -53,6 +53,7 @@ class GCDebugger:
...
@@ -53,6 +53,7 @@ class GCDebugger:
self
.
config
=
config
self
.
config
=
config
# Start time in micro second of this GC cycle
# Start time in micro second of this GC cycle
self
.
start_time_ns
:
int
=
time
.
monotonic_ns
()
self
.
start_time_ns
:
int
=
time
.
monotonic_ns
()
self
.
num_objects
:
int
=
0
# If config.top_objects is positive,
# If config.top_objects is positive,
# compute top collected objects by object types
# compute top collected objects by object types
self
.
gc_top_collected_objects
:
str
=
""
self
.
gc_top_collected_objects
:
str
=
""
...
@@ -68,19 +69,21 @@ class GCDebugger:
...
@@ -68,19 +69,21 @@ class GCDebugger:
# Before GC started, record GC start time
# Before GC started, record GC start time
# and top collected objects
# and top collected objects
self
.
start_time_ns
=
time
.
monotonic_ns
()
self
.
start_time_ns
=
time
.
monotonic_ns
()
if
(
top_objects
:
=
self
.
config
.
top_objects
)
>
0
:
objects
=
gc
.
get_objects
(
generation
)
self
.
gc_top_collected_objects
=
_compute_top_gc_collected_objects
(
self
.
num_objects
=
len
(
objects
)
gc
.
get_objects
(
generation
),
top_objects
self
.
gc_top_collected_objects
=
_compute_top_gc_collected_objects
(
)
objects
,
self
.
config
.
top_objects
)
elif
phase
==
"stop"
:
elif
phase
==
"stop"
:
# After GC finished, Record GC elapsed time and
# After GC finished, Record GC elapsed time and
# optionally top collected objects
# optionally top collected objects
elpased_ms
=
(
time
.
monotonic_ns
()
-
self
.
start_time_ns
)
/
1e6
elpased_ms
=
(
time
.
monotonic_ns
()
-
self
.
start_time_ns
)
/
1e6
logger
.
info
(
logger
.
info
(
"GC took %.3fms to complete. "
"GC took %.3fms to complete. "
"Collected %s objects in GC generation %d.%s"
,
"Collected %s objects
(out of %d)
in GC generation %d.%s"
,
elpased_ms
,
elpased_ms
,
str
(
info
.
get
(
"collected"
,
"?"
)),
str
(
info
.
get
(
"collected"
,
"?"
)),
self
.
num_objects
,
generation
,
generation
,
(
(
f
" Top collected objects:
\n
{
self
.
gc_top_collected_objects
}
"
f
" Top collected objects:
\n
{
self
.
gc_top_collected_objects
}
"
...
...
vllm/v1/core/sched/scheduler.py
View file @
30b9c677
...
@@ -1013,8 +1013,8 @@ class Scheduler(SchedulerInterface):
...
@@ -1013,8 +1013,8 @@ class Scheduler(SchedulerInterface):
continue
continue
req_index
=
model_runner_output
.
req_id_to_index
[
req_id
]
req_index
=
model_runner_output
.
req_id_to_index
[
req_id
]
generated_token_ids
:
list
[
int
]
=
(
generated_token_ids
=
(
sampled_token_ids
[
req_index
]
.
tolist
()
if
sampled_token_ids
else
[]
sampled_token_ids
[
req_index
]
if
sampled_token_ids
else
[]
)
)
scheduled_spec_token_ids
=
(
scheduled_spec_token_ids
=
(
...
...
vllm/v1/outputs.py
View file @
30b9c677
...
@@ -158,7 +158,7 @@ class ModelRunnerOutput:
...
@@ -158,7 +158,7 @@ class ModelRunnerOutput:
# num_generated_tokens is the number of tokens
# num_generated_tokens is the number of tokens
# generated in the current step. It can be different for
# generated in the current step. It can be different for
# each request due to speculative/jump decoding.
# each request due to speculative/jump decoding.
sampled_token_ids
:
list
[
np
.
ndarray
]
sampled_token_ids
:
list
[
list
[
int
]
]
# [num_reqs, max_num_logprobs + 1]
# [num_reqs, max_num_logprobs + 1]
# [num_reqs, max_num_logprobs + 1]
# [num_reqs, max_num_logprobs + 1]
...
@@ -220,7 +220,7 @@ def make_empty_encoder_model_runner_output(
...
@@ -220,7 +220,7 @@ def make_empty_encoder_model_runner_output(
req_id_to_index
:
dict
[
str
,
int
]
=
{
rid
:
idx
for
idx
,
rid
in
enumerate
(
req_ids
)}
req_id_to_index
:
dict
[
str
,
int
]
=
{
rid
:
idx
for
idx
,
rid
in
enumerate
(
req_ids
)}
# No tokens generated yet ⇒ one empty list per request
# No tokens generated yet ⇒ one empty list per request
sampled_token_ids
:
list
[
list
[
int
]]
=
[
np
.
array
(
[
0
]
)
for
_
in
req_ids
]
sampled_token_ids
:
list
[
list
[
int
]]
=
[[
0
]
for
_
in
req_ids
]
# Pooler outputs are not available yet ⇒ use None placeholders
# Pooler outputs are not available yet ⇒ use None placeholders
pooler_output
:
list
[
torch
.
Tensor
|
None
]
=
[
None
for
_
in
req_ids
]
pooler_output
:
list
[
torch
.
Tensor
|
None
]
=
[
None
for
_
in
req_ids
]
...
...
vllm/v1/sample/rejection_sampler.py
View file @
30b9c677
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
from
dataclasses
import
replace
from
dataclasses
import
replace
import
numpy
as
np
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
...
@@ -205,7 +204,7 @@ class RejectionSampler(nn.Module):
...
@@ -205,7 +204,7 @@ class RejectionSampler(nn.Module):
def
parse_output
(
def
parse_output
(
output_token_ids
:
torch
.
Tensor
,
output_token_ids
:
torch
.
Tensor
,
vocab_size
:
int
,
vocab_size
:
int
,
)
->
list
[
np
.
ndarray
]:
)
->
list
[
list
[
int
]
]:
"""Parse the output of the rejection sampler.
"""Parse the output of the rejection sampler.
Args:
Args:
output_token_ids: The sampled token IDs in shape
output_token_ids: The sampled token IDs in shape
...
@@ -221,7 +220,10 @@ class RejectionSampler(nn.Module):
...
@@ -221,7 +220,10 @@ class RejectionSampler(nn.Module):
valid_mask
=
(
output_token_ids_np
!=
PLACEHOLDER_TOKEN_ID
)
&
(
valid_mask
=
(
output_token_ids_np
!=
PLACEHOLDER_TOKEN_ID
)
&
(
output_token_ids_np
<
vocab_size
output_token_ids_np
<
vocab_size
)
)
return
[
row
[
valid_mask
[
i
]]
for
i
,
row
in
enumerate
(
output_token_ids_np
)]
outputs
=
[
row
[
valid_mask
[
i
]].
tolist
()
for
i
,
row
in
enumerate
(
output_token_ids_np
)
]
return
outputs
def
apply_logits_processors
(
def
apply_logits_processors
(
self
,
self
,
...
...
vllm/v1/spec_decode/eagle.py
View file @
30b9c677
...
@@ -496,7 +496,7 @@ class EagleProposer:
...
@@ -496,7 +496,7 @@ class EagleProposer:
def
prepare_next_token_ids_cpu
(
def
prepare_next_token_ids_cpu
(
self
,
self
,
sampled_token_ids
:
list
[
np
.
ndarray
],
sampled_token_ids
:
list
[
list
[
int
]
],
requests
:
dict
[
str
,
CachedRequestState
],
requests
:
dict
[
str
,
CachedRequestState
],
gpu_input_batch
:
InputBatch
,
gpu_input_batch
:
InputBatch
,
num_scheduled_tokens
:
dict
[
str
,
int
],
num_scheduled_tokens
:
dict
[
str
,
int
],
...
@@ -511,7 +511,7 @@ class EagleProposer:
...
@@ -511,7 +511,7 @@ class EagleProposer:
req_ids
=
gpu_input_batch
.
req_ids
req_ids
=
gpu_input_batch
.
req_ids
next_token_ids
:
list
[
int
]
=
[]
next_token_ids
:
list
[
int
]
=
[]
for
i
,
token_ids
in
enumerate
(
sampled_token_ids
):
for
i
,
token_ids
in
enumerate
(
sampled_token_ids
):
if
token_ids
.
shape
[
0
]
>
0
:
if
token_ids
:
# Common case.
# Common case.
next_token_id
=
token_ids
[
-
1
]
next_token_id
=
token_ids
[
-
1
]
else
:
else
:
...
@@ -522,9 +522,10 @@ class EagleProposer:
...
@@ -522,9 +522,10 @@ class EagleProposer:
seq_len
=
req_state
.
num_computed_tokens
+
num_scheduled_tokens
[
req_id
]
seq_len
=
req_state
.
num_computed_tokens
+
num_scheduled_tokens
[
req_id
]
next_token_id
=
req_state
.
get_token_id
(
seq_len
)
next_token_id
=
req_state
.
get_token_id
(
seq_len
)
next_token_ids
.
append
(
next_token_id
)
next_token_ids
.
append
(
next_token_id
)
return
torch
.
tensor
(
next_token_ids
=
torch
.
tensor
(
next_token_ids
,
dtype
=
torch
.
int32
,
device
=
self
.
input_ids
.
device
next_token_ids
,
dtype
=
torch
.
int32
,
device
=
self
.
input_ids
.
device
)
)
return
next_token_ids
def
prepare_next_token_ids_padded
(
def
prepare_next_token_ids_padded
(
self
,
self
,
...
...
vllm/v1/spec_decode/ngram_proposer.py
View file @
30b9c677
...
@@ -54,7 +54,7 @@ class NgramProposer:
...
@@ -54,7 +54,7 @@ class NgramProposer:
# Trigger Numba JIT compilation for N-gram proposer.
# Trigger Numba JIT compilation for N-gram proposer.
# This usually takes less than 1 second.
# This usually takes less than 1 second.
self
.
propose
(
self
.
propose
(
[
np
.
array
(
[]
)
]
*
1024
,
[[]]
*
1024
,
[
""
]
*
1024
,
[
""
]
*
1024
,
np
.
zeros
(
1024
,
dtype
=
np
.
int32
),
np
.
zeros
(
1024
,
dtype
=
np
.
int32
),
np
.
zeros
((
1024
,
self
.
max_model_len
),
dtype
=
np
.
int32
),
np
.
zeros
((
1024
,
self
.
max_model_len
),
dtype
=
np
.
int32
),
...
@@ -131,7 +131,7 @@ class NgramProposer:
...
@@ -131,7 +131,7 @@ class NgramProposer:
def
propose
(
def
propose
(
self
,
self
,
sampled_token_ids
:
list
[
np
.
ndarray
],
sampled_token_ids
:
list
[
list
[
int
]
],
req_ids
:
list
[
str
],
req_ids
:
list
[
str
],
num_tokens_no_spec
:
np
.
ndarray
,
num_tokens_no_spec
:
np
.
ndarray
,
token_ids_cpu
:
np
.
ndarray
,
token_ids_cpu
:
np
.
ndarray
,
...
@@ -140,7 +140,7 @@ class NgramProposer:
...
@@ -140,7 +140,7 @@ class NgramProposer:
# find which requests need ngram proposals
# find which requests need ngram proposals
valid_ngram_requests
=
[]
valid_ngram_requests
=
[]
for
i
,
sampled_ids
in
enumerate
(
sampled_token_ids
):
for
i
,
sampled_ids
in
enumerate
(
sampled_token_ids
):
num_sampled_ids
=
sampled_ids
.
shape
[
0
]
num_sampled_ids
=
len
(
sampled_ids
)
if
not
num_sampled_ids
:
if
not
num_sampled_ids
:
# Skip speculative decoding.
# Skip speculative decoding.
continue
continue
...
...
vllm/v1/spec_decode/suffix_decoding.py
View file @
30b9c677
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
numpy
as
np
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.v1.worker.gpu_input_batch
import
InputBatch
from
vllm.v1.worker.gpu_input_batch
import
InputBatch
...
@@ -34,16 +32,16 @@ class SuffixDecodingProposer:
...
@@ -34,16 +32,16 @@ class SuffixDecodingProposer:
def
propose
(
def
propose
(
self
,
self
,
input_batch
:
InputBatch
,
input_batch
:
InputBatch
,
sampled_token_ids
:
list
[
np
.
ndarray
],
sampled_token_ids
:
list
[
list
[
int
]
],
)
->
list
[
list
[
int
]]:
)
->
list
[
list
[
int
]]:
"""
"""
Propose speculative tokens for each request in the input batch. Suffix Decoding
Propose speculative tokens for each request in the input batch. Suffix Decoding
will speculate a dynamic number of tokens for each request every decoding step,
will speculate a dynamic number of tokens for each request every decoding step,
so each entry in the returned list may have different lengths.
so each entry in the returned list may have different lengths.
"""
"""
draft_token_ids
:
list
[
np
.
ndarray
]
=
[]
draft_token_ids
:
list
[
list
[
int
]
]
=
[]
for
i
,
sampled_ids
in
enumerate
(
sampled_token_ids
):
for
i
,
sampled_ids
in
enumerate
(
sampled_token_ids
):
if
sampled_ids
.
shape
[
0
]
==
0
:
if
not
sampled_ids
:
# Skip speculative decoding for partial prefills.
# Skip speculative decoding for partial prefills.
draft_token_ids
.
append
([])
draft_token_ids
.
append
([])
continue
continue
...
@@ -72,7 +70,7 @@ class SuffixDecodingProposer:
...
@@ -72,7 +70,7 @@ class SuffixDecodingProposer:
self
.
suffix_cache
.
start_request
(
req_id
,
prompt_token_ids
)
self
.
suffix_cache
.
start_request
(
req_id
,
prompt_token_ids
)
# Append the newly sampled ids to the suffix cache for this request.
# Append the newly sampled ids to the suffix cache for this request.
self
.
suffix_cache
.
add_active_response
(
req_id
,
sampled_ids
.
tolist
()
)
self
.
suffix_cache
.
add_active_response
(
req_id
,
sampled_ids
)
# Suffix decoding only uses the most recent tokens up to max_tree_depth, so
# Suffix decoding only uses the most recent tokens up to max_tree_depth, so
# we extract the pattern from the end of the input.
# we extract the pattern from the end of the input.
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
30b9c677
...
@@ -221,16 +221,14 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
...
@@ -221,16 +221,14 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
del
self
.
_sampled_token_ids
del
self
.
_sampled_token_ids
max_gen_len
=
self
.
sampled_token_ids_cpu
.
shape
[
-
1
]
max_gen_len
=
self
.
sampled_token_ids_cpu
.
shape
[
-
1
]
if
max_gen_len
==
1
:
if
max_gen_len
==
1
:
valid_sampled_token_ids
:
list
[
np
.
ndarray
]
=
[
valid_sampled_token_ids
=
self
.
sampled_token_ids_cpu
.
tolist
()
row
for
row
in
self
.
sampled_token_ids_cpu
.
numpy
()
]
else
:
else
:
valid_sampled_token_ids
=
RejectionSampler
.
parse_output
(
valid_sampled_token_ids
=
RejectionSampler
.
parse_output
(
self
.
sampled_token_ids_cpu
,
self
.
sampled_token_ids_cpu
,
self
.
vocab_size
,
self
.
vocab_size
,
)
)
for
i
in
self
.
_invalid_req_indices
:
for
i
in
self
.
_invalid_req_indices
:
valid_sampled_token_ids
[
i
]
=
np
.
array
([]
)
valid_sampled_token_ids
[
i
]
.
clear
(
)
output
=
self
.
_model_runner_output
output
=
self
.
_model_runner_output
output
.
sampled_token_ids
=
valid_sampled_token_ids
output
.
sampled_token_ids
=
valid_sampled_token_ids
...
@@ -2466,7 +2464,7 @@ class GPUModelRunner(
...
@@ -2466,7 +2464,7 @@ class GPUModelRunner(
)
->
tuple
[
)
->
tuple
[
dict
[
str
,
int
],
dict
[
str
,
int
],
LogprobsLists
|
None
,
LogprobsLists
|
None
,
list
[
np
.
ndarray
],
list
[
list
[
int
]
],
dict
[
str
,
LogprobsTensors
|
None
],
dict
[
str
,
LogprobsTensors
|
None
],
list
[
str
],
list
[
str
],
dict
[
str
,
int
],
dict
[
str
,
int
],
...
@@ -2492,7 +2490,6 @@ class GPUModelRunner(
...
@@ -2492,7 +2490,6 @@ class GPUModelRunner(
num_sampled_tokens
=
sampler_output
.
sampled_token_ids
.
shape
[
0
]
num_sampled_tokens
=
sampler_output
.
sampled_token_ids
.
shape
[
0
]
sampled_token_ids
=
sampler_output
.
sampled_token_ids
sampled_token_ids
=
sampler_output
.
sampled_token_ids
invalid_req_indices
=
[]
invalid_req_indices
=
[]
valid_sampled_token_ids
:
list
[
np
.
ndarray
]
if
not
self
.
use_async_scheduling
:
if
not
self
.
use_async_scheduling
:
# Get the valid generated tokens.
# Get the valid generated tokens.
max_gen_len
=
sampled_token_ids
.
shape
[
-
1
]
max_gen_len
=
sampled_token_ids
.
shape
[
-
1
]
...
@@ -2507,7 +2504,7 @@ class GPUModelRunner(
...
@@ -2507,7 +2504,7 @@ class GPUModelRunner(
)
)
# Mask out the sampled tokens that should not be sampled.
# Mask out the sampled tokens that should not be sampled.
for
i
in
discard_sampled_tokens_req_indices
:
for
i
in
discard_sampled_tokens_req_indices
:
valid_sampled_token_ids
[
int
(
i
)]
=
np
.
array
([]
)
valid_sampled_token_ids
[
int
(
i
)]
.
clear
(
)
else
:
else
:
valid_sampled_token_ids
=
[]
valid_sampled_token_ids
=
[]
invalid_req_indices
=
discard_sampled_tokens_req_indices
.
tolist
()
invalid_req_indices
=
discard_sampled_tokens_req_indices
.
tolist
()
...
@@ -2537,24 +2534,19 @@ class GPUModelRunner(
...
@@ -2537,24 +2534,19 @@ class GPUModelRunner(
[
0
]
if
spec_decode_metadata
and
logprobs_tensors
else
None
[
0
]
if
spec_decode_metadata
and
logprobs_tensors
else
None
)
)
for
req_idx
in
range
(
num_sampled_tokens
):
for
req_idx
in
range
(
num_sampled_tokens
):
sampled_ids
:
np
.
ndarray
|
None
if
self
.
use_async_scheduling
:
if
self
.
use_async_scheduling
:
sampled_ids
=
(
sampled_ids
=
[
-
1
]
if
req_idx
not
in
invalid_req_indices_set
else
None
np
.
array
([
-
1
])
if
req_idx
not
in
invalid_req_indices_set
else
None
)
else
:
else
:
sampled_ids
=
valid_sampled_token_ids
[
req_idx
]
sampled_ids
=
valid_sampled_token_ids
[
req_idx
]
num_sampled_ids
:
int
=
(
num_sampled_ids
:
int
=
len
(
sampled_ids
)
if
sampled_ids
else
0
sampled_ids
.
shape
[
0
]
if
sampled_ids
is
not
None
else
0
)
if
cu_num_accepted_tokens
is
not
None
:
if
cu_num_accepted_tokens
is
not
None
:
cu_num_accepted_tokens
.
append
(
cu_num_accepted_tokens
.
append
(
cu_num_accepted_tokens
[
-
1
]
+
num_sampled_ids
cu_num_accepted_tokens
[
-
1
]
+
num_sampled_ids
)
)
if
sampled_ids
is
None
or
num_
sampled_ids
==
0
:
if
not
sampled_ids
:
continue
continue
start_idx
=
self
.
input_batch
.
num_tokens_no_spec
[
req_idx
]
start_idx
=
self
.
input_batch
.
num_tokens_no_spec
[
req_idx
]
...
@@ -2938,9 +2930,7 @@ class GPUModelRunner(
...
@@ -2938,9 +2930,7 @@ class GPUModelRunner(
self
.
input_batch
.
prev_sampled_token_ids
=
None
self
.
input_batch
.
prev_sampled_token_ids
=
None
def
propose_draft_token_ids
(
def
propose_draft_token_ids
(
sampled_token_ids
):
sampled_token_ids
:
torch
.
Tensor
|
list
[
np
.
ndarray
],
)
->
None
:
assert
spec_decode_common_attn_metadata
is
not
None
assert
spec_decode_common_attn_metadata
is
not
None
with
record_function_or_nullcontext
(
"gpu_model_runner: draft"
):
with
record_function_or_nullcontext
(
"gpu_model_runner: draft"
):
self
.
_draft_token_ids
=
self
.
propose_draft_token_ids
(
self
.
_draft_token_ids
=
self
.
propose_draft_token_ids
(
...
@@ -3113,14 +3103,14 @@ class GPUModelRunner(
...
@@ -3113,14 +3103,14 @@ class GPUModelRunner(
def
propose_draft_token_ids
(
def
propose_draft_token_ids
(
self
,
self
,
scheduler_output
:
"SchedulerOutput"
,
scheduler_output
:
"SchedulerOutput"
,
sampled_token_ids
:
torch
.
Tensor
|
list
[
np
.
ndarray
],
sampled_token_ids
:
torch
.
Tensor
|
list
[
list
[
int
]
],
sampling_metadata
:
SamplingMetadata
,
sampling_metadata
:
SamplingMetadata
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sample_hidden_states
:
torch
.
Tensor
,
sample_hidden_states
:
torch
.
Tensor
,
aux_hidden_states
:
list
[
torch
.
Tensor
]
|
None
,
aux_hidden_states
:
list
[
torch
.
Tensor
]
|
None
,
spec_decode_metadata
:
SpecDecodeMetadata
|
None
,
spec_decode_metadata
:
SpecDecodeMetadata
|
None
,
common_attn_metadata
:
CommonAttentionMetadata
,
common_attn_metadata
:
CommonAttentionMetadata
,
)
->
torch
.
Tensor
|
list
[
list
[
int
]]:
)
->
list
[
list
[
int
]]
|
torch
.
Tensor
:
num_scheduled_tokens
=
scheduler_output
.
total_num_scheduled_tokens
num_scheduled_tokens
=
scheduler_output
.
total_num_scheduled_tokens
spec_config
=
self
.
speculative_config
spec_config
=
self
.
speculative_config
assert
spec_config
is
not
None
assert
spec_config
is
not
None
...
@@ -3154,7 +3144,7 @@ class GPUModelRunner(
...
@@ -3154,7 +3144,7 @@ class GPUModelRunner(
for
num_draft
,
tokens
in
zip
(
for
num_draft
,
tokens
in
zip
(
spec_decode_metadata
.
num_draft_tokens
,
sampled_token_ids
spec_decode_metadata
.
num_draft_tokens
,
sampled_token_ids
):
):
indices
.
append
(
offset
+
tokens
.
shape
[
0
]
-
1
)
indices
.
append
(
offset
+
len
(
tokens
)
-
1
)
offset
+=
num_draft
+
1
offset
+=
num_draft
+
1
indices
=
torch
.
tensor
(
indices
,
device
=
self
.
device
)
indices
=
torch
.
tensor
(
indices
,
device
=
self
.
device
)
hidden_states
=
sample_hidden_states
[
indices
]
hidden_states
=
sample_hidden_states
[
indices
]
...
@@ -5150,7 +5140,7 @@ class GPUModelRunner(
...
@@ -5150,7 +5140,7 @@ class GPUModelRunner(
return
kv_cache_spec
return
kv_cache_spec
def
_to_list
(
self
,
sampled_token_ids
:
torch
.
Tensor
)
->
list
[
np
.
ndarray
]:
def
_to_list
(
self
,
sampled_token_ids
:
torch
.
Tensor
)
->
list
[
list
[
int
]
]:
# This is a short term mitigation for issue mentioned in
# This is a short term mitigation for issue mentioned in
# https://github.com/vllm-project/vllm/issues/22754.
# https://github.com/vllm-project/vllm/issues/22754.
# `tolist` would trigger a cuda wise stream sync, which
# `tolist` would trigger a cuda wise stream sync, which
...
@@ -5163,4 +5153,4 @@ class GPUModelRunner(
...
@@ -5163,4 +5153,4 @@ class GPUModelRunner(
pinned
.
copy_
(
sampled_token_ids
,
non_blocking
=
True
)
pinned
.
copy_
(
sampled_token_ids
,
non_blocking
=
True
)
self
.
transfer_event
.
record
()
self
.
transfer_event
.
record
()
self
.
transfer_event
.
synchronize
()
self
.
transfer_event
.
synchronize
()
return
[
row
for
row
in
pinned
.
numpy
()
]
return
pinned
.
tolist
()
vllm/v1/worker/tpu_model_runner.py
View file @
30b9c677
...
@@ -1262,15 +1262,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1262,15 +1262,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
max_gen_len
=
selected_token_ids
.
shape
[
-
1
]
max_gen_len
=
selected_token_ids
.
shape
[
-
1
]
if
max_gen_len
==
1
:
if
max_gen_len
==
1
:
valid_sampled_token_ids
:
list
[
np
.
ndarray
]
=
[
valid_sampled_token_ids
=
selected_token_ids
.
tolist
()
row
for
row
in
selected_token_ids
.
numpy
()
]
# Mask out the sampled tokens that should not be sampled.
# Mask out the sampled tokens that should not be sampled.
# TODO: Keep in sync with gpu_model_runner.py, in particular
# TODO: Keep in sync with gpu_model_runner.py, in particular
# the "else" case here
# the "else" case here
for
i
in
discard_sampled_tokens_req_indices
:
for
i
in
discard_sampled_tokens_req_indices
:
valid_sampled_token_ids
[
i
]
=
np
.
array
([]
)
valid_sampled_token_ids
[
i
]
.
clear
(
)
# Append sampled tokens
# Append sampled tokens
for
i
,
req_state
,
seq_len
in
request_seq_lens
:
for
i
,
req_state
,
seq_len
in
request_seq_lens
:
...
@@ -1283,7 +1281,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1283,7 +1281,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
valid_mask
=
selected_token_ids
!=
INVALID_TOKEN_ID
valid_mask
=
selected_token_ids
!=
INVALID_TOKEN_ID
gen_lens
=
valid_mask
.
sum
(
dim
=
1
).
tolist
()
gen_lens
=
valid_mask
.
sum
(
dim
=
1
).
tolist
()
valid_sampled_token_ids
=
[
valid_sampled_token_ids
=
[
seq
.
numpy
()
for
seq
in
selected_token_ids
[
valid_mask
].
split
(
gen_lens
)
seq
.
tolist
()
for
seq
in
selected_token_ids
[
valid_mask
].
split
(
gen_lens
)
]
]
self
.
input_batch
.
num_tokens
[:
num_reqs
]
+=
gen_lens
self
.
input_batch
.
num_tokens
[:
num_reqs
]
+=
gen_lens
for
i
,
req_state
,
seq_len
in
request_seq_lens
:
for
i
,
req_state
,
seq_len
in
request_seq_lens
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment