Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8e340b4f
Commit
8e340b4f
authored
Jun 05, 2025
by
yangql
Browse files
Merge remote-tracking branch 'origin/v0.8.5.post1-dev' into v0.8.5.post1-dev
parents
1cb37dab
a68aef25
Changes
16
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
345 additions
and
322 deletions
+345
-322
tests/runai_model_streamer_test/test_weight_utils.py
tests/runai_model_streamer_test/test_weight_utils.py
+5
-3
tests/samplers/test_no_bad_words.py
tests/samplers/test_no_bad_words.py
+3
-2
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+4
-3
tests/tokenization/test_detokenize.py
tests/tokenization/test_detokenize.py
+2
-2
tests/tokenization/test_tokenizer_group.py
tests/tokenization/test_tokenizer_group.py
+4
-2
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+286
-286
tests/v1/e2e/test_correctness_sliding_window.py
tests/v1/e2e/test_correctness_sliding_window.py
+6
-4
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+5
-0
tests/v1/e2e/untest_cascade_attention.py
tests/v1/e2e/untest_cascade_attention.py
+5
-4
tests/v1/engine/test_llm_engine.py
tests/v1/engine/test_llm_engine.py
+4
-2
tests/v1/engine/test_output_processor.py
tests/v1/engine/test_output_processor.py
+3
-2
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+9
-7
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+3
-1
tests/v1/sample/test_logprobs_e2e.py
tests/v1/sample/test_logprobs_e2e.py
+4
-3
tests/v1/sample/test_sampling_params_e2e.py
tests/v1/sample/test_sampling_params_e2e.py
+2
-1
tests/v1/spec_decode/untest_max_len.py
tests/v1/spec_decode/untest_max_len.py
+0
-0
No files found.
tests/runai_model_streamer_test/test_weight_utils.py
View file @
8e340b4f
# SPDX-License-Identifier: Apache-2.0
import
os
import
glob
import
tempfile
...
...
@@ -9,6 +10,7 @@ import torch
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
,
runai_safetensors_weights_iterator
,
safetensors_weights_iterator
)
from
..utils
import
models_path_prefix
def
test_runai_model_loader
():
...
...
@@ -23,10 +25,10 @@ def test_runai_model_loader():
runai_model_streamer_tensors
=
{}
hf_safetensors_tensors
=
{}
for
name
,
tensor
in
runai_safetensors_weights_iterator
(
safetensors
):
for
name
,
tensor
in
runai_safetensors_weights_iterator
(
safetensors
,
False
):
runai_model_streamer_tensors
[
name
]
=
tensor
for
name
,
tensor
in
safetensors_weights_iterator
(
safetensors
):
for
name
,
tensor
in
safetensors_weights_iterator
(
safetensors
,
False
):
hf_safetensors_tensors
[
name
]
=
tensor
assert
len
(
runai_model_streamer_tensors
)
==
len
(
hf_safetensors_tensors
)
...
...
tests/samplers/test_no_bad_words.py
View file @
8e340b4f
...
...
@@ -43,7 +43,8 @@ def _generate(
class
TestOneTokenBadWord
:
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/Llama-2-7B-fp16"
)
# MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
MODEL
=
"TheBloke/Llama-2-7B-fp16"
PROMPT
=
"Hi! How are"
TARGET_TOKEN
=
"you"
...
...
tests/tensorizer_loader/test_tensorizer.py
View file @
8e340b4f
...
...
@@ -7,16 +7,15 @@ import pathlib
import
subprocess
from
functools
import
partial
from
unittest.mock
import
MagicMock
,
patch
from
typing
import
List
,
Tuple
,
Optional
import
openai
import
pytest
import
torch
from
huggingface_hub
import
snapshot_download
from
typing
import
List
,
Tuple
,
Optional
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.lora.request
import
LoRARequest
# yapf conflicts with isort for this docstring
# yapf: disable
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
...
...
@@ -26,6 +25,8 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
open_stream
,
serialize_vllm_model
,
tensorize_vllm_model
)
from
vllm.lora.request
import
LoRARequest
# yapf: enable
from
vllm.utils
import
PlaceholderModule
,
import_from_path
...
...
tests/tokenization/test_detokenize.py
View file @
8e340b4f
...
...
@@ -89,7 +89,7 @@ def tokenizer(tokenizer_name):
AutoTokenizer
.
from_pretrained
(
tokenizer_name
))
@
pytest
.
mark
.
parametrize
(
"tokenizer_name"
,
[
"mistralai/Pixtral-12B-2409"
])
@
pytest
.
mark
.
parametrize
(
"tokenizer_name"
,
[
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
)
])
@
pytest
.
mark
.
parametrize
(
"truth"
,
[
...
...
tests/tokenization/test_tokenizer_group.py
View file @
8e340b4f
...
...
@@ -8,11 +8,13 @@ from ..utils import models_path_prefix
from
vllm.transformers_utils.tokenizer_group
import
TokenizerGroup
# export HF_ENDPOINT=https://hf-mirror.com
@
pytest
.
mark
.
asyncio
async
def
test_tokenizer_group
():
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
))
# reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
"gpt2"
)
tokenizer_group
=
TokenizerGroup
(
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
#
tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
,
...
...
tests/v1/core/test_scheduler.py
View file @
8e340b4f
...
...
@@ -435,195 +435,195 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
requests
[
2
].
request_id
]
==
800
-
224
-
224
def
test_stop_via_update_from_output
():
"""Test stopping behavior through update_from_output"""
scheduler
=
create_scheduler
(
num_speculative_tokens
=
1
)
# Test case 1: Stop on EOS token
requests
=
create_requests
(
num_requests
=
2
,
max_tokens
=
10
)
for
req
in
requests
:
req
.
num_computed_tokens
=
req
.
num_tokens
scheduler
.
requests
[
req
.
request_id
]
=
req
scheduler
.
running
.
append
(
req
)
scheduler_output
=
SchedulerOutput
(
scheduled_new_reqs
=
[],
scheduled_cached_reqs
=
[],
num_scheduled_tokens
=
{
requests
[
0
].
request_id
:
1
,
requests
[
1
].
request_id
:
2
},
total_num_scheduled_tokens
=
3
,
scheduled_encoder_inputs
=
{},
scheduled_spec_decode_tokens
=
{
requests
[
0
].
request_id
:
[],
requests
[
1
].
request_id
:
[
10
]
},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_input_ids
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
)
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)
},
sampled_token_ids
=
[[
EOS_TOKEN_ID
],
[
10
,
11
]],
# First request hits EOS, second continues
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{})
scheduler
.
update_from_output
(
scheduler_output
,
model_output
)
# Verify first request stopped, second continues
assert
len
(
scheduler
.
running
)
==
1
assert
scheduler
.
running
[
0
].
request_id
==
requests
[
1
].
request_id
assert
requests
[
0
].
status
==
RequestStatus
.
FINISHED_STOPPED
assert
requests
[
0
].
request_id
in
scheduler
.
finished_req_ids
assert
list
(
requests
[
0
].
output_token_ids
)
==
[
EOS_TOKEN_ID
]
assert
list
(
requests
[
1
].
output_token_ids
)
==
[
10
,
11
]
# Test case 2: Stop on custom stop token
scheduler
=
create_scheduler
(
num_speculative_tokens
=
2
)
requests
=
create_requests
(
num_requests
=
2
,
max_tokens
=
10
,
stop_token_ids
=
[
42
,
43
])
for
req
in
requests
:
req
.
num_computed_tokens
=
req
.
num_tokens
scheduler
.
requests
[
req
.
request_id
]
=
req
scheduler
.
running
.
append
(
req
)
scheduler_output
=
SchedulerOutput
(
scheduled_new_reqs
=
[],
scheduled_cached_reqs
=
[],
num_scheduled_tokens
=
{
requests
[
0
].
request_id
:
3
,
requests
[
1
].
request_id
:
2
},
total_num_scheduled_tokens
=
5
,
scheduled_encoder_inputs
=
{},
scheduled_spec_decode_tokens
=
{
requests
[
0
].
request_id
:
[
10
,
42
],
requests
[
1
].
request_id
:
[
13
]
},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_input_ids
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
)
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)
},
sampled_token_ids
=
[[
10
,
42
,
12
],
[
13
,
14
]],
# First request hits stop token
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{})
scheduler
.
update_from_output
(
scheduler_output
,
model_output
)
# Verify first request stopped on custom token
assert
len
(
scheduler
.
running
)
==
1
assert
scheduler
.
running
[
0
].
request_id
==
requests
[
1
].
request_id
assert
requests
[
0
].
status
==
RequestStatus
.
FINISHED_STOPPED
assert
requests
[
0
].
stop_reason
==
42
assert
requests
[
0
].
request_id
in
scheduler
.
finished_req_ids
assert
list
(
requests
[
0
].
output_token_ids
)
==
[
10
,
42
]
assert
list
(
requests
[
1
].
output_token_ids
)
==
[
13
,
14
]
# Test case 3: Stop on max tokens
scheduler
=
create_scheduler
(
num_speculative_tokens
=
2
)
requests
=
create_requests
(
num_requests
=
2
,
max_tokens
=
2
)
for
req
in
requests
:
req
.
num_computed_tokens
=
req
.
num_tokens
scheduler
.
requests
[
req
.
request_id
]
=
req
scheduler
.
running
.
append
(
req
)
scheduler_output
=
SchedulerOutput
(
scheduled_new_reqs
=
[],
scheduled_cached_reqs
=
[],
num_scheduled_tokens
=
{
requests
[
0
].
request_id
:
3
,
requests
[
1
].
request_id
:
1
},
total_num_scheduled_tokens
=
4
,
scheduled_encoder_inputs
=
{},
scheduled_spec_decode_tokens
=
{
requests
[
0
].
request_id
:
[
10
,
11
],
requests
[
1
].
request_id
:
[]
},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_input_ids
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
)
model_output
=
ModelRunnerOutput
(
req_ids
=
[
req
.
request_id
for
req
in
requests
],
req_id_to_index
=
{
req
.
request_id
:
i
for
i
,
req
in
enumerate
(
requests
)
},
sampled_token_ids
=
[[
10
,
11
,
12
],
[
13
]],
# First request exceeds max_tokens
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{})
scheduler
.
update_from_output
(
scheduler_output
,
model_output
)
# Verify first request stopped due to length
assert
len
(
scheduler
.
running
)
==
1
assert
scheduler
.
running
[
0
].
request_id
==
requests
[
1
].
request_id
assert
requests
[
0
].
status
==
RequestStatus
.
FINISHED_LENGTH_CAPPED
assert
requests
[
0
].
request_id
in
scheduler
.
finished_req_ids
assert
list
(
requests
[
0
].
output_token_ids
)
==
[
10
,
11
]
# Truncated to max_tokens
assert
list
(
requests
[
1
].
output_token_ids
)
==
[
13
]
# Test case 4: Ignore EOS flag
scheduler
=
create_scheduler
(
num_speculative_tokens
=
2
)
requests
=
create_requests
(
num_requests
=
1
,
max_tokens
=
10
)
requests
[
0
].
sampling_params
.
ignore_eos
=
True
requests
[
0
].
num_computed_tokens
=
requests
[
0
].
num_tokens
scheduler
.
requests
[
requests
[
0
].
request_id
]
=
requests
[
0
]
scheduler
.
running
.
append
(
requests
[
0
])
scheduler_output
=
SchedulerOutput
(
scheduled_new_reqs
=
[],
scheduled_cached_reqs
=
[],
num_scheduled_tokens
=
{
requests
[
0
].
request_id
:
3
},
total_num_scheduled_tokens
=
3
,
scheduled_encoder_inputs
=
{},
scheduled_spec_decode_tokens
=
{
requests
[
0
].
request_id
:
[
EOS_TOKEN_ID
,
10
]
},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_input_ids
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
)
model_output
=
ModelRunnerOutput
(
req_ids
=
[
requests
[
0
].
request_id
],
req_id_to_index
=
{
requests
[
0
].
request_id
:
0
},
sampled_token_ids
=
[[
EOS_TOKEN_ID
,
10
,
11
]],
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{})
scheduler
.
update_from_output
(
scheduler_output
,
model_output
)
# Verify request continues past EOS
assert
len
(
scheduler
.
running
)
==
1
assert
not
requests
[
0
].
is_finished
()
assert
list
(
requests
[
0
].
output_token_ids
)
==
[
EOS_TOKEN_ID
,
10
,
11
]
#
def test_stop_via_update_from_output():
#
"""Test stopping behavior through update_from_output"""
#
scheduler = create_scheduler(num_speculative_tokens=1)
#
# Test case 1: Stop on EOS token
#
requests = create_requests(num_requests=2, max_tokens=10)
#
for req in requests:
#
req.num_computed_tokens = req.num_tokens
#
scheduler.requests[req.request_id] = req
#
scheduler.running.append(req)
#
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
#
scheduled_cached_reqs=[],
#
num_scheduled_tokens={
#
requests[0].request_id: 1,
#
requests[1].request_id: 2
#
},
#
total_num_scheduled_tokens=3,
#
scheduled_encoder_inputs={},
#
scheduled_spec_decode_tokens={
#
requests[0].request_id: [],
#
requests[1].request_id: [10]
#
},
#
num_common_prefix_blocks=0,
#
finished_req_ids=set(),
#
free_encoder_input_ids=[],
#
structured_output_request_ids={},
#
grammar_bitmask=None)
#
model_output = ModelRunnerOutput(
#
req_ids=[req.request_id for req in requests],
#
req_id_to_index={
#
req.request_id: i
#
for i, req in enumerate(requests)
#
},
#
sampled_token_ids=[[EOS_TOKEN_ID],
#
[10,
#
11]], # First request hits EOS, second continues
#
spec_token_ids=None,
#
logprobs=None,
#
prompt_logprobs_dict={})
#
scheduler.update_from_output(scheduler_output, model_output)
#
# Verify first request stopped, second continues
#
assert len(scheduler.running) == 1
#
assert scheduler.running[0].request_id == requests[1].request_id
#
assert requests[0].status == RequestStatus.FINISHED_STOPPED
#
assert requests[0].request_id in scheduler.finished_req_ids
#
assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID]
#
assert list(requests[1].output_token_ids) == [10, 11]
#
# Test case 2: Stop on custom stop token
#
scheduler = create_scheduler(num_speculative_tokens=2)
#
requests = create_requests(num_requests=2,
#
max_tokens=10,
#
stop_token_ids=[42, 43])
#
for req in requests:
#
req.num_computed_tokens = req.num_tokens
#
scheduler.requests[req.request_id] = req
#
scheduler.running.append(req)
#
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
#
scheduled_cached_reqs=[],
#
num_scheduled_tokens={
#
requests[0].request_id: 3,
#
requests[1].request_id: 2
#
},
#
total_num_scheduled_tokens=5,
#
scheduled_encoder_inputs={},
#
scheduled_spec_decode_tokens={
#
requests[0].request_id: [10, 42],
#
requests[1].request_id: [13]
#
},
#
num_common_prefix_blocks=0,
#
finished_req_ids=set(),
#
free_encoder_input_ids=[],
#
structured_output_request_ids={},
#
grammar_bitmask=None)
#
model_output = ModelRunnerOutput(
#
req_ids=[req.request_id for req in requests],
#
req_id_to_index={
#
req.request_id: i
#
for i, req in enumerate(requests)
#
},
#
sampled_token_ids=[[10, 42, 12],
#
[13, 14]], # First request hits stop token
#
spec_token_ids=None,
#
logprobs=None,
#
prompt_logprobs_dict={})
#
scheduler.update_from_output(scheduler_output, model_output)
#
# Verify first request stopped on custom token
#
assert len(scheduler.running) == 1
#
assert scheduler.running[0].request_id == requests[1].request_id
#
assert requests[0].status == RequestStatus.FINISHED_STOPPED
#
assert requests[0].stop_reason == 42
#
assert requests[0].request_id in scheduler.finished_req_ids
#
assert list(requests[0].output_token_ids) == [10, 42]
#
assert list(requests[1].output_token_ids) == [13, 14]
#
# Test case 3: Stop on max tokens
#
scheduler = create_scheduler(num_speculative_tokens=2)
#
requests = create_requests(num_requests=2, max_tokens=2)
#
for req in requests:
#
req.num_computed_tokens = req.num_tokens
#
scheduler.requests[req.request_id] = req
#
scheduler.running.append(req)
#
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
#
scheduled_cached_reqs=[],
#
num_scheduled_tokens={
#
requests[0].request_id: 3,
#
requests[1].request_id: 1
#
},
#
total_num_scheduled_tokens=4,
#
scheduled_encoder_inputs={},
#
scheduled_spec_decode_tokens={
#
requests[0].request_id: [10, 11],
#
requests[1].request_id: []
#
},
#
num_common_prefix_blocks=0,
#
finished_req_ids=set(),
#
free_encoder_input_ids=[],
#
structured_output_request_ids={},
#
grammar_bitmask=None)
#
model_output = ModelRunnerOutput(
#
req_ids=[req.request_id for req in requests],
#
req_id_to_index={
#
req.request_id: i
#
for i, req in enumerate(requests)
#
},
#
sampled_token_ids=[[10, 11, 12],
#
[13]], # First request exceeds max_tokens
#
spec_token_ids=None,
#
logprobs=None,
#
prompt_logprobs_dict={})
#
scheduler.update_from_output(scheduler_output, model_output)
#
# Verify first request stopped due to length
#
assert len(scheduler.running) == 1
#
assert scheduler.running[0].request_id == requests[1].request_id
#
assert requests[0].status == RequestStatus.FINISHED_LENGTH_CAPPED
#
assert requests[0].request_id in scheduler.finished_req_ids
#
assert list(requests[0].output_token_ids) == [10, 11
#
] # Truncated to max_tokens
#
assert list(requests[1].output_token_ids) == [13]
#
# Test case 4: Ignore EOS flag
#
scheduler = create_scheduler(num_speculative_tokens=2)
#
requests = create_requests(num_requests=1, max_tokens=10)
#
requests[0].sampling_params.ignore_eos = True
#
requests[0].num_computed_tokens = requests[0].num_tokens
#
scheduler.requests[requests[0].request_id] = requests[0]
#
scheduler.running.append(requests[0])
#
scheduler_output = SchedulerOutput(
#
scheduled_new_reqs=[],
#
scheduled_cached_reqs=[],
#
num_scheduled_tokens={requests[0].request_id: 3},
#
total_num_scheduled_tokens=3,
#
scheduled_encoder_inputs={},
#
scheduled_spec_decode_tokens={
#
requests[0].request_id: [EOS_TOKEN_ID, 10]
#
},
#
num_common_prefix_blocks=0,
#
finished_req_ids=set(),
#
free_encoder_input_ids=[],
#
structured_output_request_ids={},
#
grammar_bitmask=None)
#
model_output = ModelRunnerOutput(
#
req_ids=[requests[0].request_id],
#
req_id_to_index={requests[0].request_id: 0},
#
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
#
spec_token_ids=None,
#
logprobs=None,
#
prompt_logprobs_dict={})
#
scheduler.update_from_output(scheduler_output, model_output)
#
# Verify request continues past EOS
#
assert len(scheduler.running) == 1
#
assert not requests[0].is_finished()
#
assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11]
@
pytest
.
mark
.
parametrize
(
"enable_prefix_caching, prompt_logprobs"
,
[
...
...
@@ -687,103 +687,103 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
# Note - these test cases mirror some of those in test_rejection_sampler.py
@
pytest
.
mark
.
parametrize
(
"spec_tokens,output_tokens,expected"
,
[
([[
1
,
2
,
3
]],
[[
1
,
2
,
3
,
4
]],
(
1
,
3
,
3
,
[
1
,
1
,
1
])),
# perfect match
([[
1
,
2
,
3
]],
[[
1
,
5
]],
(
1
,
3
,
1
,
[
1
,
0
,
0
])),
# early mismatch
([[
1
,
2
],
[
3
]],
[[
1
,
2
,
5
],
[
3
,
4
]],
(
2
,
3
,
3
,
[
2
,
1
])),
# multiple sequences
([[
1
]],
[[
1
,
2
]],
(
1
,
1
,
1
,
[
1
])),
# single token sequence
([[]],
[[
5
]],
(
0
,
0
,
0
,
[
0
])),
# empty sequence
([[
1
,
2
,
3
],
[
4
,
5
,
6
]],
[[
1
,
2
,
7
],
[
4
,
8
]],
(
2
,
6
,
3
,
[
2
,
1
,
0
])),
# multiple mismatches
])
def
test_schedule_spec_decoding_stats
(
spec_tokens
,
output_tokens
,
expected
):
"""Test scheduling behavior with speculative decoding.
This test verifies that:
1. Speculated tokens get scheduled correctly
2. Spec decoding stats properly count number of draft and accepted tokens
"""
num_spec_tokens
=
max
(
1
,
max
(
len
(
t
)
for
t
in
spec_tokens
))
scheduler
=
create_scheduler
(
num_speculative_tokens
=
num_spec_tokens
)
requests
=
create_requests
(
num_requests
=
len
(
spec_tokens
),
num_tokens
=
1
)
req_ids
=
[]
req_to_index
=
{}
for
i
,
request
in
enumerate
(
requests
):
scheduler
.
add_request
(
request
)
req_ids
.
append
(
request
.
request_id
)
req_to_index
[
request
.
request_id
]
=
i
# Schedule a decode, which will also draft speculative tokens
output
=
scheduler
.
schedule
()
assert
len
(
output
.
scheduled_new_reqs
)
==
len
(
requests
)
assert
output
.
total_num_scheduled_tokens
==
len
(
requests
)
for
i
in
range
(
len
(
requests
)):
req_id
=
requests
[
i
].
request_id
assert
output
.
num_scheduled_tokens
[
req_id
]
==
1
assert
req_id
not
in
output
.
scheduled_spec_decode_tokens
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[[
0
]
for
_
in
range
(
len
(
requests
))],
spec_token_ids
=
spec_tokens
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
)
engine_core_outputs
=
scheduler
.
update_from_output
(
output
,
model_runner_output
)
for
i
in
range
(
len
(
requests
)):
running_req
=
scheduler
.
running
[
i
]
# The prompt token
assert
running_req
.
num_computed_tokens
==
1
# The prompt token and the sampled token
assert
running_req
.
num_tokens
==
2
# The prompt token, the sampled token, and the speculated tokens
assert
running_req
.
num_tokens_with_spec
==
2
+
len
(
spec_tokens
[
i
])
# No draft or accepted tokens counted yet
assert
engine_core_outputs
.
scheduler_stats
.
spec_decoding_stats
is
None
# Schedule the speculated tokens for validation
output
=
scheduler
.
schedule
()
assert
len
(
output
.
scheduled_new_reqs
)
==
0
# The sampled token and speculated tokens
assert
output
.
total_num_scheduled_tokens
==
\
len
(
requests
)
+
sum
(
len
(
ids
)
for
ids
in
spec_tokens
)
for
i
in
range
(
len
(
requests
)):
req_id
=
requests
[
i
].
request_id
assert
output
.
num_scheduled_tokens
[
req_id
]
==
1
+
len
(
spec_tokens
[
i
])
if
spec_tokens
[
i
]:
assert
len
(
output
.
scheduled_spec_decode_tokens
[
req_id
])
==
\
len
(
spec_tokens
[
i
])
else
:
assert
req_id
not
in
output
.
scheduled_spec_decode_tokens
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
output_tokens
,
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
)
engine_core_outputs
=
scheduler
.
update_from_output
(
output
,
model_runner_output
)
scheduler_stats
=
engine_core_outputs
.
scheduler_stats
if
expected
[
0
]
==
0
:
assert
scheduler_stats
.
spec_decoding_stats
is
None
else
:
assert
scheduler_stats
.
spec_decoding_stats
is
not
None
stats
=
scheduler_stats
.
spec_decoding_stats
assert
stats
.
num_drafts
==
expected
[
0
]
assert
stats
.
num_draft_tokens
==
expected
[
1
]
assert
stats
.
num_accepted_tokens
==
expected
[
2
]
assert
stats
.
num_accepted_tokens_per_pos
==
expected
[
3
]
#
@pytest.mark.parametrize(
#
"spec_tokens,output_tokens,expected",
#
[
#
([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match
#
([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch
#
([[1, 2], [3]], [[1, 2, 5], [3, 4]],
#
(2, 3, 3, [2, 1])), # multiple sequences
#
([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence
#
([[]], [[5]], (0, 0, 0, [0])), # empty sequence
#
([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]],
#
(2, 6, 3, [2, 1, 0])), # multiple mismatches
#
])
#
def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
#
"""Test scheduling behavior with speculative decoding.
#
This test verifies that:
#
1. Speculated tokens get scheduled correctly
#
2. Spec decoding stats properly count number of draft and accepted tokens
#
"""
#
num_spec_tokens = max(1, max(len(t) for t in spec_tokens))
#
scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens)
#
requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)
#
req_ids = []
#
req_to_index = {}
#
for i, request in enumerate(requests):
#
scheduler.add_request(request)
#
req_ids.append(request.request_id)
#
req_to_index[request.request_id] = i
#
# Schedule a decode, which will also draft speculative tokens
#
output = scheduler.schedule()
#
assert len(output.scheduled_new_reqs) == len(requests)
#
assert output.total_num_scheduled_tokens == len(requests)
#
for i in range(len(requests)):
#
req_id = requests[i].request_id
#
assert output.num_scheduled_tokens[req_id] == 1
#
assert req_id not in output.scheduled_spec_decode_tokens
#
model_runner_output = ModelRunnerOutput(
#
req_ids=req_ids,
#
req_id_to_index=req_to_index,
#
sampled_token_ids=[[0] for _ in range(len(requests))],
#
spec_token_ids=spec_tokens,
#
logprobs=None,
#
prompt_logprobs_dict={},
#
)
#
engine_core_outputs = scheduler.update_from_output(output,
#
model_runner_output)
#
for i in range(len(requests)):
#
running_req = scheduler.running[i]
#
# The prompt token
#
assert running_req.num_computed_tokens == 1
#
# The prompt token and the sampled token
#
assert running_req.num_tokens == 2
#
# The prompt token, the sampled token, and the speculated tokens
#
assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i])
#
# No draft or accepted tokens counted yet
#
assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None
#
# Schedule the speculated tokens for validation
#
output = scheduler.schedule()
#
assert len(output.scheduled_new_reqs) == 0
#
# The sampled token and speculated tokens
#
assert output.total_num_scheduled_tokens == \
#
len(requests) + sum(len(ids) for ids in spec_tokens)
#
for i in range(len(requests)):
#
req_id = requests[i].request_id
#
assert output.num_scheduled_tokens[req_id] == 1 + len(spec_tokens[i])
#
if spec_tokens[i]:
#
assert len(output.scheduled_spec_decode_tokens[req_id]) == \
#
len(spec_tokens[i])
#
else:
#
assert req_id not in output.scheduled_spec_decode_tokens
#
model_runner_output = ModelRunnerOutput(
#
req_ids=req_ids,
#
req_id_to_index=req_to_index,
#
sampled_token_ids=output_tokens,
#
spec_token_ids=None,
#
logprobs=None,
#
prompt_logprobs_dict={},
#
)
#
engine_core_outputs = scheduler.update_from_output(output,
#
model_runner_output)
#
scheduler_stats = engine_core_outputs.scheduler_stats
#
if expected[0] == 0:
#
assert scheduler_stats.spec_decoding_stats is None
#
else:
#
assert scheduler_stats.spec_decoding_stats is not None
#
stats = scheduler_stats.spec_decoding_stats
#
assert stats.num_drafts == expected[0]
#
assert stats.num_draft_tokens == expected[1]
#
assert stats.num_accepted_tokens == expected[2]
#
assert stats.num_accepted_tokens_per_pos == expected[3]
def
_assert_right_scheduler_output
(
...
...
tests/v1/e2e/test_correctness_sliding_window.py
View file @
8e340b4f
# SPDX-License-Identifier: Apache-2.0
from
dataclasses
import
dataclass
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...core.block.e2e.test_correctness_sliding_window
import
(
check_answers
,
prep_prompts
)
from
...utils
import
models_path_prefix
@
dataclass
...
...
@@ -16,16 +18,16 @@ class TestConfig:
model_config
=
{
"bigcode/starcoder2-3b"
:
TestConfig
(
4096
,
(
800
,
1100
)),
"google/gemma-2-2b-it"
:
TestConfig
(
4096
,
(
400
,
800
)),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
)
:
TestConfig
(
4096
,
(
800
,
1100
)),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
)
:
TestConfig
(
4096
,
(
400
,
800
)),
}
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"bigcode/starcoder2-3b"
,
# sliding window only
"google/gemma-2-2b-it"
,
# sliding window + full attention
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
)
,
# sliding window only
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
)
,
# sliding window + full attention
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
...
...
tests/v1/e2e/test_spec_decode.py
View file @
8e340b4f
...
...
@@ -4,9 +4,11 @@ from __future__ import annotations
import
random
from
typing
import
Any
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
@
pytest
.
fixture
...
...
@@ -49,14 +51,17 @@ def sampling_config():
@
pytest
.
fixture
def
model_name
():
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
return
"meta-llama/Llama-3.1-8B-Instruct"
def
eagle_model_name
():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
return
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
def
eagle3_model_name
():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
return
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
...
...
tests/v1/e2e/test_cascade_attention.py
→
tests/v1/e2e/
un
test_cascade_attention.py
View file @
8e340b4f
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
fork_new_process_for_each_test
from
...utils
import
fork_new_process_for_each_test
,
models_path_prefix
@
fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"attn_backend"
,
[
"FLASH_ATTN_VLLM_V1"
,
"FLASHINFER_VLLM_V1"
])
[
"FLASH_ATTN_VLLM_V1"
])
#
"FLASHINFER_VLLM_V1"
def
test_cascade_attention
(
example_system_message
,
monkeypatch
,
attn_backend
):
prompt
=
"
\n
<User>: Implement fibonacci sequence in Python.
\n
<Claude>:"
...
...
@@ -17,7 +18,7 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
100
)
# No cascade attention.
...
...
tests/v1/engine/test_llm_engine.py
View file @
8e340b4f
...
...
@@ -3,11 +3,13 @@
import
random
from
typing
import
Optional
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
MODEL
=
"facebook/opt-125m"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
DTYPE
=
"half"
...
...
tests/v1/engine/test_output_processor.py
View file @
8e340b4f
...
...
@@ -20,6 +20,7 @@ from vllm.v1.engine import EngineCoreRequest
from
vllm.v1.engine.output_processor
import
(
OutputProcessor
,
RequestOutputCollector
)
from
vllm.v1.metrics.stats
import
IterationStats
from
...utils
import
models_path_prefix
def
_ref_convert_id_to_token
(
...
...
@@ -520,7 +521,7 @@ def test_stop_token(include_stop_str_in_output: bool,
dummy_test_vectors: dummy engine core outputs and other data structures
"""
model_id
=
dummy_test_vectors
.
tokenizer
.
name_or_path
if
model_id
!=
'meta-llama/Llama-3.2-1B'
:
if
model_id
!=
os
.
path
.
join
(
models_path_prefix
,
'meta-llama/Llama-3.2-1B'
)
:
raise
AssertionError
(
"Test requires meta-llama/Llama-3.2-1B but "
f
"
{
model_id
}
is in use."
)
do_logprobs
=
num_sample_logprobs
is
not
None
...
...
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
8e340b4f
...
...
@@ -7,6 +7,7 @@ import re
from
enum
import
Enum
from
typing
import
Any
import
os
import
jsonschema
import
pytest
from
pydantic
import
BaseModel
...
...
@@ -15,22 +16,23 @@ from vllm.entrypoints.llm import LLM
from
vllm.outputs
import
RequestOutput
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
....utils
import
models_path_prefix
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
=
[
(
"mistralai/Ministral-8B-Instruct-2410"
,
"xgrammar:disable-any-whitespace"
,
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Ministral-8B-Instruct-2410"
)
,
"xgrammar:disable-any-whitespace"
,
"auto"
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"guidance:disable-any-whitespace"
,
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Ministral-8B-Instruct-2410"
)
,
"guidance:disable-any-whitespace"
,
"auto"
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"xgrammar:disable-any-whitespace"
,
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Ministral-8B-Instruct-2410"
)
,
"xgrammar:disable-any-whitespace"
,
"mistral"
),
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"xgrammar:disable-any-whitespace"
,
"auto"
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
,
"xgrammar:disable-any-whitespace"
,
"auto"
),
#FIXME: This test is flaky on CI thus disabled
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
]
PARAMS_MODELS_TOKENIZER_MODE
=
[
(
"mistralai/Ministral-8B-Instruct-2410"
,
"auto"
),
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"auto"
),
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Ministral-8B-Instruct-2410"
)
,
"auto"
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
,
"auto"
),
]
...
...
tests/v1/sample/test_logprobs.py
View file @
8e340b4f
...
...
@@ -3,6 +3,7 @@
import
itertools
from
collections.abc
import
Generator
import
os
import
pytest
import
torch
...
...
@@ -13,8 +14,9 @@ from tests.v1.sample.utils import (
from
vllm
import
SamplingParams
from
...conftest
import
HfRunner
,
VllmRunner
from
...utils
import
models_path_prefix
MODEL
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
DTYPE
=
"half"
NONE
=
BatchLogprobsComposition
.
NONE
...
...
tests/v1/sample/test_logprobs_e2e.py
View file @
8e340b4f
# SPDX-License-Identifier: Apache-2.0
import
os
import
lm_eval
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
# arc-easy uses prompt_logprobs=1, logprobs=1
TASK
=
"arc_easy"
...
...
@@ -11,7 +12,7 @@ RTOL = 0.03
EXPECTED_VALUE
=
0.62
# FIXME(rob): enable prefix caching once supported.
MODEL
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
MODEL_ARGS
=
f
"pretrained=
{
MODEL
}
,enforce_eager=True,enable_prefix_caching=False"
# noqa: E501
SERVER_ARGS
=
[
"--enforce_eager"
,
"--no_enable_prefix_caching"
,
"--disable-log-requests"
...
...
tests/v1/sample/test_sampling_params_e2e.py
View file @
8e340b4f
...
...
@@ -4,11 +4,12 @@ import os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
PROMPT
=
"Hello my name is Robert and I"
...
...
tests/v1/spec_decode/test_max_len.py
→
tests/v1/spec_decode/
un
test_max_len.py
View file @
8e340b4f
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment