Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dd572c0a
Unverified
Commit
dd572c0a
authored
Jul 18, 2025
by
Woosuk Kwon
Committed by
GitHub
Jul 18, 2025
Browse files
[V0 Deprecation] Remove V0 Spec Decode workers (#21152)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
9ffe905a
Changes
73
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
145 additions
and
2631 deletions
+145
-2631
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+0
-14
.github/CODEOWNERS
.github/CODEOWNERS
+0
-1
.github/mergify.yml
.github/mergify.yml
+0
-3
pyproject.toml
pyproject.toml
+0
-1
tests/core/test_serialization.py
tests/core/test_serialization.py
+1
-1
tests/core/utils.py
tests/core/utils.py
+131
-3
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+0
-146
tests/models/registry.py
tests/models/registry.py
+4
-4
tests/models/test_registry.py
tests/models/test_registry.py
+9
-5
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+0
-577
tests/samplers/test_typical_acceptance_sampler.py
tests/samplers/test_typical_acceptance_sampler.py
+0
-480
tests/spec_decode/__init__.py
tests/spec_decode/__init__.py
+0
-0
tests/spec_decode/conftest.py
tests/spec_decode/conftest.py
+0
-12
tests/spec_decode/e2e/__init__.py
tests/spec_decode/e2e/__init__.py
+0
-0
tests/spec_decode/e2e/conftest.py
tests/spec_decode/e2e/conftest.py
+0
-307
tests/spec_decode/e2e/test_compatibility.py
tests/spec_decode/e2e/test_compatibility.py
+0
-66
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+0
-480
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+0
-161
tests/spec_decode/e2e/test_integration_dist_tp2.py
tests/spec_decode/e2e/test_integration_dist_tp2.py
+0
-247
tests/spec_decode/e2e/test_integration_dist_tp4.py
tests/spec_decode/e2e/test_integration_dist_tp4.py
+0
-123
No files found.
.buildkite/test-pipeline.yaml
View file @
dd572c0a
...
@@ -159,7 +159,6 @@ steps:
...
@@ -159,7 +159,6 @@ steps:
-
tests/distributed/test_utils
-
tests/distributed/test_utils
-
tests/distributed/test_pynccl
-
tests/distributed/test_pynccl
-
tests/distributed/test_events
-
tests/distributed/test_events
-
tests/spec_decode/e2e/test_integration_dist_tp4
-
tests/compile/test_basic_correctness
-
tests/compile/test_basic_correctness
-
examples/offline_inference/rlhf.py
-
examples/offline_inference/rlhf.py
-
examples/offline_inference/rlhf_colocate.py
-
examples/offline_inference/rlhf_colocate.py
...
@@ -182,7 +181,6 @@ steps:
...
@@ -182,7 +181,6 @@ steps:
-
pytest -v -s compile/test_basic_correctness.py
-
pytest -v -s compile/test_basic_correctness.py
-
pytest -v -s distributed/test_pynccl.py
-
pytest -v -s distributed/test_pynccl.py
-
pytest -v -s distributed/test_events.py
-
pytest -v -s distributed/test_events.py
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: create a dedicated test section for multi-GPU example tests
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# when we have multiple distributed example tests
-
pushd ../examples/offline_inference
-
pushd ../examples/offline_inference
...
@@ -330,17 +328,6 @@ steps:
...
@@ -330,17 +328,6 @@ steps:
-
pytest -v -s samplers
-
pytest -v -s samplers
-
VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-
VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-
label
:
Speculative decoding tests
# 40min
mirror_hardwares
:
[
amdexperimental
]
source_file_dependencies
:
-
vllm/spec_decode
-
tests/spec_decode
-
vllm/model_executor/models/eagle.py
commands
:
-
pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
-
pytest -v -s spec_decode/e2e/test_eagle_correctness.py
-
label
:
LoRA Test %N
# 15min each
-
label
:
LoRA Test %N
# 15min each
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
source_file_dependencies
:
source_file_dependencies
:
...
@@ -726,7 +713,6 @@ steps:
...
@@ -726,7 +713,6 @@ steps:
-
pytest -v -s distributed/test_sequence_parallel.py
-
pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# this test fails consistently.
# TODO: investigate and fix
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-
VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
-
VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
...
...
.github/CODEOWNERS
View file @
dd572c0a
...
@@ -43,7 +43,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
...
@@ -43,7 +43,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/multimodal @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
...
...
.github/mergify.yml
View file @
dd572c0a
...
@@ -164,10 +164,7 @@ pull_request_rules:
...
@@ -164,10 +164,7 @@ pull_request_rules:
description
:
Automatically apply speculative-decoding label
description
:
Automatically apply speculative-decoding label
conditions
:
conditions
:
-
or
:
-
or
:
-
files~=^vllm/spec_decode/
-
files~=^vllm/v1/spec_decode/
-
files~=^vllm/v1/spec_decode/
-
files=vllm/model_executor/layers/spec_decode_base_sampler.py
-
files~=^tests/spec_decode/
-
files~=^tests/v1/spec_decode/
-
files~=^tests/v1/spec_decode/
-
files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
-
files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
-
files~=^vllm/model_executor/models/.*eagle.*\.py
-
files~=^vllm/model_executor/models/.*eagle.*\.py
...
...
pyproject.toml
View file @
dd572c0a
...
@@ -73,7 +73,6 @@ line-length = 80
...
@@ -73,7 +73,6 @@ line-length = 80
"vllm/engine/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/engine/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/executor/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/executor/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/prompt_adapter/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/prompt_adapter/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/spec_decode/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/worker/**/*.py"
=
[
"UP006"
,
"UP035"
]
"vllm/worker/**/*.py"
=
[
"UP006"
,
"UP035"
]
# Python 3.8 typing - skip utils for ROCm
# Python 3.8 typing - skip utils for ROCm
"vllm/utils/__init__.py"
=
[
"UP006"
,
"UP035"
]
"vllm/utils/__init__.py"
=
[
"UP006"
,
"UP035"
]
...
...
tests/core/test_serialization.py
View file @
dd572c0a
...
@@ -6,7 +6,7 @@ import msgspec
...
@@ -6,7 +6,7 @@ import msgspec
from
vllm.executor.msgspec_utils
import
decode_hook
,
encode_hook
from
vllm.executor.msgspec_utils
import
decode_hook
,
encode_hook
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.sequence
import
ExecuteModelRequest
from
..spec_decode
.utils
import
create_batch
from
.utils
import
create_batch
def
test_msgspec_serialization
():
def
test_msgspec_serialization
():
...
...
tests/core/utils.py
View file @
dd572c0a
...
@@ -4,15 +4,16 @@
...
@@ -4,15 +4,16 @@
import
time
import
time
from
collections
import
defaultdict
from
collections
import
defaultdict
from
collections.abc
import
Sequence
as
GenericSequence
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Any
,
Optional
from
itertools
import
count
from
typing
import
Any
,
Optional
,
Union
import
torch
import
torch
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
Scheduler
,
SchedulerOutputs
from
vllm.core.scheduler
import
Scheduler
,
SchedulerOutputs
from
vllm.inputs
import
EncoderDecoderInputs
,
embeds_inputs
,
token_inputs
from
vllm.inputs
import
EncoderDecoderInputs
,
embeds_inputs
,
token_inputs
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
(
Logprob
,
Sequence
,
SequenceGroup
,
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
(
Logprob
,
Sequence
,
SequenceData
,
SequenceGroup
,
SequenceGroupMetadata
)
SequenceGroupMetadata
)
...
@@ -262,3 +263,130 @@ class SchedulerProxy:
...
@@ -262,3 +263,130 @@ class SchedulerProxy:
self
,
)
->
tuple
[
list
[
SequenceGroupMetadata
],
SchedulerOutputs
,
Any
]:
self
,
)
->
tuple
[
list
[
SequenceGroupMetadata
],
SchedulerOutputs
,
Any
]:
_
,
_
,
ret
=
self
.
call_history
[
"schedule"
][
-
1
]
_
,
_
,
ret
=
self
.
call_history
[
"schedule"
][
-
1
]
return
ret
return
ret
def
create_seq_group_metadata_from_prompts
(
prompts
:
list
[
list
[
int
]],
num_gpu_blocks
:
int
,
block_size
:
int
,
final_prompt_lens
:
list
[
int
],
continuations
:
Optional
[
list
[
list
[
int
]]]
=
None
,
seq_ids
:
Optional
[
list
[
int
]]
=
None
,
)
->
list
[
SequenceGroupMetadata
]:
if
continuations
is
None
:
continuations
=
[[]
for
_
in
prompts
]
if
seq_ids
is
None
:
seq_ids
=
list
(
i
for
i
,
_
in
enumerate
(
prompts
))
free_gpu_blocks
=
list
(
range
(
num_gpu_blocks
))
block_allocations
=
{
i
:
[
free_gpu_blocks
.
pop
()
for
_
in
range
(
round_up_to_next_block
(
final_len
,
block_size
))
]
for
i
,
final_len
in
enumerate
(
final_prompt_lens
)
}
seq_grou_metadata_list
=
[]
for
i
,
(
prompt_token_ids
,
cont_token_ids
)
in
enumerate
(
zip
(
prompts
,
continuations
)):
data
=
SequenceData
.
from_seqs
(
prompt_token_ids
,
cont_token_ids
)
data
.
update_num_computed_tokens
(
len
(
prompt_token_ids
)
+
len
(
cont_token_ids
)
-
1
)
seq_data
=
{
i
:
data
}
seq_grou_metadata_list
.
append
(
SequenceGroupMetadata
(
request_id
=
str
(
i
),
is_prompt
=
len
(
cont_token_ids
)
==
0
,
seq_data
=
seq_data
,
sampling_params
=
SamplingParams
(
temperature
=
0.0
),
block_tables
=
{
i
:
block_allocations
[
i
][:]},
))
return
seq_grou_metadata_list
def
create_chunked_seq_group_metadata_from_prompt
(
prompt
:
list
[
int
],
num_gpu_blocks
:
int
,
chunk_size
:
int
,
block_size
:
int
,
seq_id
:
Optional
[
int
]
=
None
)
->
list
[
SequenceGroupMetadata
]:
if
seq_id
is
None
:
seq_id
=
0
free_gpu_blocks
=
list
(
range
(
num_gpu_blocks
))
block_allocations
=
[
free_gpu_blocks
.
pop
()
for
_
in
range
(
round_up_to_next_block
(
len
(
prompt
),
block_size
))
]
seq_group_metadata_list
=
[]
for
i
,
idx
in
enumerate
(
range
(
0
,
len
(
prompt
),
chunk_size
)):
chunk_ids
=
prompt
[
idx
:
idx
+
chunk_size
]
data
=
SequenceData
.
from_seqs
(
prompt
)
data
.
update_num_computed_tokens
(
idx
)
seq_data
=
{
i
:
data
}
seq_group_metadata_list
.
append
(
SequenceGroupMetadata
(
request_id
=
str
(
seq_id
),
is_prompt
=
True
,
do_sample
=
idx
+
chunk_size
>=
len
(
prompt
),
# terminal chunk
seq_data
=
seq_data
,
sampling_params
=
SamplingParams
(
temperature
=
0.0
),
block_tables
=
{
i
:
block_allocations
},
token_chunk_size
=
len
(
chunk_ids
)))
return
seq_group_metadata_list
def
create_batch
(
batch_size
,
k
,
prompt_len
:
Union
[
int
,
list
[
int
]]
=
10
,
prev_output_token_len
:
int
=
10
,
seq_ids
:
Optional
[
list
[
int
]]
=
None
,
num_gpu_blocks
:
Optional
[
int
]
=
None
,
block_size
:
Optional
[
int
]
=
None
,
prefill_chunk_size
:
Optional
[
int
]
=
None
):
if
block_size
is
None
:
block_size
=
8
if
num_gpu_blocks
is
None
:
num_gpu_blocks
=
2048
//
block_size
iterator
=
count
()
if
isinstance
(
prompt_len
,
int
):
prompt_lens
=
[
prompt_len
for
_
in
range
(
batch_size
)]
else
:
prompt_lens
=
prompt_len
prompts
=
[[
next
(
iterator
)
for
_
in
range
(
p_len
)]
for
p_len
in
prompt_lens
]
if
prefill_chunk_size
:
# Create a batch of chunked prompts.
if
not
seq_ids
:
seq_ids
=
list
(
range
(
len
(
prompts
)))
seq_group_metadata_list
=
[]
for
p
,
sid
in
zip
(
prompts
,
seq_ids
):
seq_group_metadata_list
+=
\
create_chunked_seq_group_metadata_from_prompt
(
p
,
num_gpu_blocks
,
prefill_chunk_size
,
block_size
,
sid
)
seq_group_metadata_list
=
seq_group_metadata_list
[:
batch_size
]
prev_output_tokens
=
[]
else
:
prev_output_tokens
=
[[
next
(
iterator
)
for
_
in
range
(
prev_output_token_len
)
]
for
_
in
range
(
batch_size
)]
final_prompt_lens
=
[
len
(
prompt
)
+
len
(
prev_output_token
)
+
k
+
1
for
prompt
,
prev_output_token
in
zip
(
prompts
,
prev_output_tokens
)
]
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
prompts
,
num_gpu_blocks
,
block_size
,
final_prompt_lens
,
prev_output_tokens
,
seq_ids
)
return
seq_group_metadata_list
,
prompts
,
prev_output_tokens
tests/metrics/test_metrics.py
View file @
dd572c0a
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
time
import
pytest
import
pytest
import
ray
import
ray
from
prometheus_client
import
REGISTRY
from
prometheus_client
import
REGISTRY
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm
import
EngineArgs
,
LLMEngine
from
vllm
import
EngineArgs
,
LLMEngine
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.metrics
import
RayPrometheusStatLogger
from
vllm.engine.metrics
import
RayPrometheusStatLogger
...
@@ -232,149 +229,6 @@ def test_engine_log_metrics_regression(
...
@@ -232,149 +229,6 @@ def test_engine_log_metrics_regression(
assert_metrics
(
model
,
engine
,
disable_log_stats
,
len
(
example_prompts
))
assert_metrics
(
model
,
engine
,
disable_log_stats
,
len
(
example_prompts
))
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
10
])
def
test_metric_spec_decode
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
k
=
5
with
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
,
speculative_config
=
{
"model"
:
model
,
"num_speculative_tokens"
:
k
,
},
)
as
vllm_model
:
# Force log interval to be 0 to catch all metrics.
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_loggers
[
'prometheus'
]
stat_logger
.
local_interval
=
0
# Note that the purpose of this test is to verify spec decode
# metrics instead of functional correctness, so the expected values
# are intended to be loose.
metric_name_to_expected_fn
=
{
"gauge_spec_decode_draft_acceptance_rate"
:
lambda
v
:
0
<=
v
<=
1
,
"gauge_spec_decode_efficiency"
:
lambda
v
:
0
<=
v
<=
1
,
"counter_spec_decode_num_accepted_tokens"
:
lambda
v
:
0
<=
v
<=
k
,
"counter_spec_decode_num_draft_tokens"
:
lambda
v
:
v
==
k
,
"counter_spec_decode_num_emitted_tokens"
:
lambda
v
:
0
<=
v
<=
k
+
1
,
}
# Use one request to better inspect the metrics.
prompts
=
example_prompts
[:
1
]
_
=
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
for
metric_name
,
is_expected
in
metric_name_to_expected_fn
.
items
():
metric_val
=
getattr
(
stat_logger
.
metrics
,
metric_name
).
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
assert
is_expected
(
metric_val
),
(
f
"the value of metric
{
metric_name
}
(
{
metric_val
}
) "
"does not meet expectation"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"log_interval"
,
[
1
,
3
,
5
,
7
])
def
test_metric_spec_decode_interval
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
log_interval
:
int
,
)
->
None
:
k
=
5
engine_args
=
EngineArgs
(
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
,
speculative_config
=
{
"model"
:
model
,
"num_speculative_tokens"
:
k
,
},
enforce_eager
=
True
,
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
try
:
engine
.
add_request
(
"request-id-0"
,
example_prompts
[
0
],
SamplingParams
(
max_tokens
=
max_tokens
),
)
# set log internal
stat_logger
=
engine
.
stat_loggers
[
'prometheus'
]
stat_logger
.
local_interval
=
log_interval
# prefill
engine
.
step
()
# wait for 5 seconds to ensure that spec decode metrics
# get triggered in first decode step
time
.
sleep
(
5
)
# first decode step should trigger async collection of metrics
engine
.
step
()
# wait one second to allow H2D transfer to finish
time
.
sleep
(
1
)
# second decode step should now be able to collect the spec
# decode stats and the request should also be finished
engine
.
step
()
# must have finisehd now
assert
not
engine
.
has_unfinished_requests
()
# wait to ensure logging occurs
time
.
sleep
(
log_interval
)
# force logging
engine
.
step
()
# Note that the purpose of this test is to verify spec decode
# metrics instead of functional correctness, so the expected values
# are intended to be loose.
metric_name_to_expected_fn
=
{
"gauge_spec_decode_draft_acceptance_rate"
:
lambda
v
:
0
<=
v
<=
1
,
"gauge_spec_decode_efficiency"
:
lambda
v
:
0
<=
v
<=
1
,
"counter_spec_decode_num_accepted_tokens"
:
lambda
v
:
0
<=
v
<=
k
,
"counter_spec_decode_num_draft_tokens"
:
lambda
v
:
v
==
k
,
"counter_spec_decode_num_emitted_tokens"
:
lambda
v
:
0
<=
v
<=
k
+
1
,
}
for
metric_name
,
is_expected
in
metric_name_to_expected_fn
.
items
():
metric_val
=
getattr
(
stat_logger
.
metrics
,
metric_name
).
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
assert
is_expected
(
metric_val
),
(
f
"the value of metric
{
metric_name
}
(
{
metric_val
}
) "
"does not meet expectation"
)
finally
:
del
engine
cleanup_dist_env_and_memory
()
def
assert_metrics
(
model
:
str
,
engine
:
LLMEngine
,
disable_log_stats
:
bool
,
def
assert_metrics
(
model
:
str
,
engine
:
LLMEngine
,
disable_log_stats
:
bool
,
num_requests
:
int
)
->
None
:
num_requests
:
int
)
->
None
:
if
disable_log_stats
:
if
disable_log_stats
:
...
...
tests/models/registry.py
View file @
dd572c0a
...
@@ -457,12 +457,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -457,12 +457,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
_SPECULATIVE_DECODING_EXAMPLE_MODELS
=
{
_SPECULATIVE_DECODING_EXAMPLE_MODELS
=
{
"EAGLEModel"
:
_HfExamplesInfo
(
"JackFram/llama-68m"
,
speculative_model
=
"abhigoyal/vllm-eagle-llama-68m-random"
),
# noqa: E501
"MedusaModel"
:
_HfExamplesInfo
(
"JackFram/llama-68m"
,
"MedusaModel"
:
_HfExamplesInfo
(
"JackFram/llama-68m"
,
speculative_model
=
"abhigoyal/vllm-medusa-llama-68m-random"
),
# noqa: E501
speculative_model
=
"abhigoyal/vllm-medusa-llama-68m-random"
),
# noqa: E501
"MLPSpeculatorPreTrainedModel"
:
_HfExamplesInfo
(
"JackFram/llama-160m"
,
# Temporarily disabled.
speculative_model
=
"ibm-ai-platform/llama-160m-accelerator"
),
# noqa: E501
# TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
# speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501
"DeepSeekMTPModel"
:
_HfExamplesInfo
(
"luccafong/deepseek_mtp_main_random"
,
"DeepSeekMTPModel"
:
_HfExamplesInfo
(
"luccafong/deepseek_mtp_main_random"
,
speculative_model
=
"luccafong/deepseek_mtp_draft_random"
,
# noqa: E501
speculative_model
=
"luccafong/deepseek_mtp_draft_random"
,
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
...
...
tests/models/test_registry.py
View file @
dd572c0a
...
@@ -72,11 +72,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
...
@@ -72,11 +72,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_arch,is_pp,init_cuda"
,
[
@
pytest
.
mark
.
parametrize
(
(
"MLPSpeculatorPreTrainedModel"
,
False
,
False
),
"model_arch,is_pp,init_cuda"
,
[
# TODO(woosuk): Re-enable this once the MLP Speculator is supported
# in V1.
# ("MLPSpeculatorPreTrainedModel", False, False),
(
"DeepseekV2ForCausalLM"
,
True
,
False
),
(
"DeepseekV2ForCausalLM"
,
True
,
False
),
(
"Qwen2VLForConditionalGeneration"
,
True
,
True
),
(
"Qwen2VLForConditionalGeneration"
,
True
,
True
),
])
])
def
test_registry_is_pp
(
model_arch
,
is_pp
,
init_cuda
):
def
test_registry_is_pp
(
model_arch
,
is_pp
,
init_cuda
):
assert
ModelRegistry
.
is_pp_supported_model
(
model_arch
)
is
is_pp
assert
ModelRegistry
.
is_pp_supported_model
(
model_arch
)
is
is_pp
...
...
tests/samplers/test_rejection_sampler.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for rejection sampling."""
import
pytest
import
torch
import
torch.nn.functional
as
F
from
vllm.model_executor.layers.rejection_sampler
import
RejectionSampler
from
vllm.model_executor.utils
import
set_random_seed
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
This file tests V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
def
mock_causal_accepted_tensor
(
k
:
int
,
last_accepted_indices
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""Generate an "accepted" tensor which should yield causally-accepted tokens
up to last accepted indices.
Tokens after last_accepted_indices+1 may also be accepted, although they
will not be causally accepted.
"""
batch_size
=
last_accepted_indices
.
shape
[
0
]
accepted
=
(
torch
.
arange
(
k
).
expand
(
batch_size
,
k
)
<=
last_accepted_indices
.
unsqueeze
(
-
1
).
broadcast_to
(
batch_size
,
k
))
# Sprinkle accepted values after the contiguous initial accepted values.
# This replicates the behavior of rejection sampling, which may "accept"
# a token that cannot be accepted because of causality.
sprinkle_candidates
=
(
torch
.
arange
(
k
).
expand
(
batch_size
,
k
)
>
last_accepted_indices
.
unsqueeze
(
-
1
).
broadcast_to
(
batch_size
,
k
)
+
1
)
sprinkle
=
torch
.
rand
(
batch_size
,
k
)
>
0.5
accepted
[
sprinkle_candidates
]
=
sprinkle
[
sprinkle_candidates
]
return
accepted
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
10
)))
@
pytest
.
mark
.
parametrize
(
"which_tokens_accepted"
,
[
"all_tokens_accepted"
,
"no_tokens_accepted"
,
"some_tokens_accepted"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
True
,
False
])
@
torch
.
inference_mode
()
def
test_correct_output_format
(
which_tokens_accepted
:
str
,
seed
:
int
,
device
:
str
,
use_flashinfer
:
bool
):
"""Verify the output has correct format given predetermined accepted matrix.
"""
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
batch_size
=
10
k
=
5
vocab_size
=
3000
if
which_tokens_accepted
==
"all_tokens_accepted"
:
accepted
=
mock_causal_accepted_tensor
(
k
,
-
1
+
k
*
torch
.
ones
((
batch_size
,
),
dtype
=
torch
.
long
))
elif
which_tokens_accepted
==
"no_tokens_accepted"
:
accepted
=
mock_causal_accepted_tensor
(
k
,
-
torch
.
ones
((
batch_size
,
),
dtype
=
torch
.
long
))
elif
which_tokens_accepted
==
"some_tokens_accepted"
:
last_accepted_indices
=
torch
.
randint
(
low
=-
1
,
high
=
k
,
size
=
(
batch_size
,
))
accepted
=
mock_causal_accepted_tensor
(
k
,
last_accepted_indices
)
else
:
raise
AssertionError
()
recovered_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
output_token_ids
=
rejection_sampler
.
_create_output
(
# pylint: disable=protected-access
accepted
,
recovered_token_ids
,
draft_token_ids
,
bonus_token_ids
,
)
expected_bonus_token_ids
=
bonus_token_ids
.
clone
()
if
which_tokens_accepted
==
"all_tokens_accepted"
:
# Expect all tokens to be equal to draft tokens.
assert
torch
.
equal
(
output_token_ids
[:,
:
-
1
],
draft_token_ids
)
# Expect all bonus tokens to be included.
assert
torch
.
equal
(
output_token_ids
[:,
-
1
:],
expected_bonus_token_ids
)
elif
which_tokens_accepted
==
"no_tokens_accepted"
:
# Expect first token to be equal to recovered tokens.
assert
torch
.
equal
(
output_token_ids
[:,
0
],
recovered_token_ids
[:,
0
])
# Expect everything else to be -1.
assert
torch
.
equal
(
output_token_ids
[:,
1
:],
torch
.
ones_like
(
output_token_ids
[:,
1
:])
*
-
1
)
elif
which_tokens_accepted
==
"some_tokens_accepted"
:
recovered_plus_bonus
=
torch
.
cat
(
(
recovered_token_ids
,
expected_bonus_token_ids
),
dim
=-
1
)
# Assert first rejected token is a recovered token or bonus token.
assert
torch
.
equal
(
recovered_plus_bonus
[
torch
.
arange
(
0
,
batch_size
),
last_accepted_indices
+
1
],
output_token_ids
[
torch
.
arange
(
0
,
batch_size
),
last_accepted_indices
+
1
])
# Assert every subsequent token is -1.
subsequent_mask
=
torch
.
arange
(
0
,
k
+
1
).
expand
(
batch_size
,
k
+
1
)
>=
(
last_accepted_indices
+
2
).
unsqueeze
(
-
1
)
assert
torch
.
all
(
output_token_ids
[
subsequent_mask
]
==
-
1
)
@
pytest
.
mark
.
parametrize
(
"k"
,
list
(
range
(
1
,
6
)))
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
32
)))
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
True
,
False
])
@
torch
.
inference_mode
()
def
test_no_crash_with_varying_dims
(
k
:
int
,
vocab_size
:
int
,
batch_size
:
int
,
device
:
str
,
use_flashinfer
:
bool
):
torch
.
set_default_device
(
device
)
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
)
@
pytest
.
mark
.
parametrize
(
"frac_seeded"
,
[
0.0
,
0.25
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
1
,
3
,
6
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
8
,
32
,
128
])
@
pytest
.
mark
.
parametrize
(
"n_rep"
,
[
100
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
# @pytest.mark.parametrize("use_flashinfer", [True, False])
# Not testing FlashInfer now, since 0.2.3 API removed the ability
# to pass in uniform samples.
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
False
])
@
torch
.
inference_mode
()
def
test_deterministic_when_seeded
(
k
:
int
,
vocab_size
:
int
,
batch_size
:
int
,
frac_seeded
:
float
,
n_rep
:
int
,
device
:
str
,
use_flashinfer
:
bool
):
torch
.
set_default_device
(
device
)
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
seeded_mask
=
torch
.
rand
(
batch_size
,
dtype
=
torch
.
float32
)
<=
frac_seeded
results
=
[]
for
_
in
range
(
n_rep
):
seeded_seqs
=
{
i
:
torch
.
Generator
(
device
=
device
).
manual_seed
(
i
)
for
i
in
range
(
batch_size
)
if
seeded_mask
[
i
]
}
results
.
append
(
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
,
seeded_seqs
))
for
i
in
range
(
batch_size
):
if
seeded_mask
[
i
]:
for
j
in
range
(
1
,
n_rep
):
assert
torch
.
equal
(
results
[
j
][
i
],
results
[
0
][
i
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
1
,
3
,
6
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
3
,
8
,
32
,
128
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
# @pytest.mark.parametrize("use_flashinfer", [True, False])
# Not testing FlashInfer now, since 0.2.3 API removed the ability
# to pass in uniform samples.
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
False
])
@
torch
.
inference_mode
()
def
test_mixed_seeded_batch
(
k
:
int
,
vocab_size
:
int
,
batch_size
:
int
,
device
:
str
,
use_flashinfer
:
bool
):
torch
.
set_default_device
(
device
)
set_random_seed
(
0
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
single_batches
=
[]
for
i
in
range
(
batch_size
):
single_batches
.
append
((
draft_probs
[
i
].
clone
().
unsqueeze
(
0
),
draft_token_ids
[
i
].
clone
().
unsqueeze
(
0
),
target_probs
[
i
].
clone
().
unsqueeze
(
0
),
bonus_token_ids
[
i
].
clone
().
unsqueeze
(
0
),
draft_token_ids
[
i
].
clone
().
unsqueeze
(
0
)))
set_random_seed
(
0
)
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
results
=
[]
seeded_seqs
=
{
i
:
torch
.
Generator
(
device
=
device
).
manual_seed
(
i
)
for
i
in
range
(
1
,
batch_size
)
# 0 is seed None
}
batch_result
=
rejection_sampler
(
target_probs
.
clone
(),
bonus_token_ids
.
clone
(),
draft_probs
.
clone
(),
draft_token_ids
.
clone
(),
seeded_seqs
)
set_random_seed
(
0
)
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
for
i
in
range
(
batch_size
):
request_seeded_seqs
=
{
0
:
torch
.
Generator
(
device
=
device
).
manual_seed
(
i
)
}
if
seeded_seqs
.
get
(
i
)
is
not
None
else
None
(
draft_probs
,
draft_token_ids
,
target_probs
,
bonus_token_ids
,
draft_token_ids
)
=
single_batches
[
i
]
results
.
append
(
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
,
request_seeded_seqs
))
for
i
in
range
(
batch_size
):
assert
torch
.
equal
(
batch_result
[
i
],
results
[
i
].
squeeze
(
0
))
@
pytest
.
mark
.
parametrize
(
"k"
,
[
1
,
3
,
6
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
8
,
32
,
128
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_compare_nonflashinfer_backend
(
k
:
int
,
vocab_size
:
int
,
batch_size
:
int
,
device
:
str
):
"""
Test the flashinfer and nonflashinfer backend generate
the same output metrics.
"""
pytest
.
skip
(
"Not testing FlashInfer now, since 0.2.3 API removed "
"the ability to pass in uniform samples."
)
torch
.
set_default_device
(
device
)
torch
.
manual_seed
(
0
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
num_accepted_tokens
=
[]
num_emitted_tokens
=
[]
num_draft_tokens
=
[]
def
get_seeded_seqs
():
return
{
i
:
torch
.
Generator
(
device
=
device
).
manual_seed
(
i
)
for
i
in
range
(
batch_size
)
}
for
use_flashinfer
in
[
True
,
False
]:
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
# We use seeded sequences to ensure the same tokens are accepted
# for both flashinfer and nonflashinfer backends.
seeded_seqs
=
get_seeded_seqs
()
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
,
seeded_seqs
)
num_accepted_tokens
.
append
(
rejection_sampler
.
num_accepted_tokens
)
num_emitted_tokens
.
append
(
rejection_sampler
.
num_emitted_tokens
)
num_draft_tokens
.
append
(
rejection_sampler
.
num_draft_tokens
)
assert
num_accepted_tokens
[
0
]
==
num_accepted_tokens
[
1
]
assert
num_emitted_tokens
[
0
]
==
num_emitted_tokens
[
1
]
assert
num_draft_tokens
[
0
]
==
num_draft_tokens
[
1
]
@
pytest
.
mark
.
parametrize
(
"above_or_below_vocab_range"
,
[
"above"
,
"below"
])
@
pytest
.
mark
.
parametrize
(
"which_token_ids"
,
[
"bonus_token_ids"
,
"draft_token_ids"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
True
,
False
])
@
torch
.
inference_mode
()
def
test_raises_when_vocab_oob
(
above_or_below_vocab_range
:
str
,
which_token_ids
:
str
,
device
:
str
,
use_flashinfer
:
bool
):
k
=
3
batch_size
=
5
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
,
strict_mode
=
True
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
oob_token_ids
=
None
if
which_token_ids
==
"bonus_token_ids"
:
oob_token_ids
=
bonus_token_ids
elif
which_token_ids
==
"draft_token_ids"
:
oob_token_ids
=
draft_token_ids
else
:
raise
AssertionError
()
if
above_or_below_vocab_range
==
"above"
:
rogue_token_id
=
vocab_size
+
1
elif
above_or_below_vocab_range
==
"below"
:
rogue_token_id
=
-
1
else
:
raise
AssertionError
()
oob_token_ids
[
0
][
0
]
=
rogue_token_id
with
pytest
.
raises
(
AssertionError
):
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
)
@
pytest
.
mark
.
parametrize
(
"draft_and_target_probs_equal"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
5
)))
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
True
,
False
])
@
torch
.
inference_mode
()
def
test_rejection_sampling_approximates_target_distribution
(
seed
:
int
,
draft_and_target_probs_equal
:
bool
,
use_flashinfer
:
bool
):
"""Verify rejection sampling approximates target distribution,
despite sampling from a potentially distinct draft distribution.
This is done by first creating a random target probability
distribution and a random draft probability distribution. We then
sample token ids from the rejection sampler using these draft
and target distributions. The samples are used to estimate
the output probability distribution, which we expect to approximate
the target distribution.
A basic distance metric is used to determine similarity between
distributions.
We expect that as we increase the number of samples,
the distance between the observed distribution and the target
distribution decreases. To measure this, we compare the distance
of the observed distribution against both the target distribution
and a uniform random distribution. We expect the distance between
the observed distribution and the target distribution to improve
much more than the distance improvement between the observed
distribution and the random distribution.
When draft_and_target_probs_equal=True, the draft and target
probabilities are exactly equal. Rejection sampling should
still work without any NaNs or exceptions.
"""
torch
.
set_default_device
(
"cpu"
)
set_random_seed
(
seed
)
helper
=
_CorrectnessTestHelper
(
vocab_size
=
10
,
rejection_sampler
=
RejectionSampler
(
use_flashinfer
=
use_flashinfer
),
)
draft_probs
,
target_probs
,
reference_probs
=
helper
.
generate_probs_for_test
(
draft_and_target_probs_equal
)
sample_sizes
=
[
10
,
100
,
1_000
,
10_000
,
100_000
]
distance_wrt_reference
:
list
[
float
]
=
[]
distance_wrt_target
:
list
[
float
]
=
[]
for
num_samples
in
sample_sizes
:
(
reference_vs_rejsample_dist
,
target_vs_rejsample_dist
)
=
helper
.
run_and_compare_distributions
(
draft_probs
,
target_probs
,
reference_probs
,
num_samples
,
)
distance_wrt_reference
.
append
(
reference_vs_rejsample_dist
)
distance_wrt_target
.
append
(
target_vs_rejsample_dist
)
relative_change_in_distance_wrt_target
=
get_ratio_first_to_last
(
distance_wrt_target
)
relative_change_in_distance_wrt_reference
=
get_ratio_first_to_last
(
distance_wrt_reference
)
print
(
f
"
{
num_samples
=
}
{
target_vs_rejsample_dist
=
:.
05
f
}
"
f
"
{
reference_vs_rejsample_dist
=
:.
05
f
}
"
)
print
(
f
"
{
num_samples
=
}
{
relative_change_in_distance_wrt_target
=
:.
02
f
}
"
f
"
{
relative_change_in_distance_wrt_reference
=
:.
02
f
}
"
)
relative_change_in_distance_wrt_target
=
get_ratio_first_to_last
(
distance_wrt_target
)
relative_change_in_distance_wrt_reference
=
get_ratio_first_to_last
(
distance_wrt_reference
)
expected_improvement_multiplier
=
20
assert
(
relative_change_in_distance_wrt_target
>
relative_change_in_distance_wrt_reference
*
expected_improvement_multiplier
)
def
get_ratio_first_to_last
(
elements
:
list
[
float
])
->
float
:
return
elements
[
0
]
/
elements
[
-
1
]
class
_CorrectnessTestHelper
:
"""Class that packages together logic required for the unit-level
rejection sampling correctness test.
"""
def
__init__
(
self
,
vocab_size
:
int
,
rejection_sampler
:
RejectionSampler
):
self
.
rejection_sampler
=
rejection_sampler
self
.
vocab_size
=
vocab_size
self
.
vocab_range
=
(
0
,
vocab_size
)
self
.
rejection_sampler
.
init_gpu_tensors
(
device
=
0
)
# Keep test simple, use k=1
self
.
k
=
1
# Bonus tokens not used, but rejection sampler requires
# correct shape.
self
.
num_bonus_tokens
=
1
def
generate_probs_for_test
(
self
,
draft_and_target_probs_equal
:
bool
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
]:
draft_probs
,
target_probs
=
(
F
.
softmax
(
torch
.
rand
(
self
.
vocab_size
,
dtype
=
torch
.
float32
),
dim
=-
1
,
)
for
_
in
range
(
2
))
num_reference_probs
=
100
reference_probs
=
F
.
softmax
(
torch
.
rand
(
num_reference_probs
,
self
.
vocab_size
,
dtype
=
torch
.
float32
),
dim
=-
1
,
)
if
draft_and_target_probs_equal
:
target_probs
=
draft_probs
.
clone
()
return
draft_probs
,
target_probs
,
reference_probs
def
run_and_compare_distributions
(
self
,
draft_probs
:
torch
.
Tensor
,
target_probs
:
torch
.
Tensor
,
reference_probs
:
torch
.
Tensor
,
num_samples
:
int
)
->
tuple
[
float
,
float
]:
# Sample using rejection sampling.
rej_sample_probs
=
self
.
_estimate_rejection_sampling_pdf
(
draft_probs
,
target_probs
,
num_samples
)
# Average distance from reference probs.
reference_vs_rejsample_dist
=
torch
.
dist
(
reference_probs
,
rej_sample_probs
).
item
()
/
reference_probs
.
shape
[
0
]
target_vs_rejsample_dist
=
torch
.
dist
(
target_probs
,
rej_sample_probs
).
item
()
return
reference_vs_rejsample_dist
,
target_vs_rejsample_dist
def
_estimate_rejection_sampling_pdf
(
self
,
draft_probs
:
torch
.
Tensor
,
target_probs
:
torch
.
Tensor
,
num_samples
:
int
,
)
->
torch
.
Tensor
:
# Repeat draft probs num_samples times.
draft_probs
=
draft_probs
.
reshape
(
1
,
self
.
k
,
self
.
vocab_size
).
repeat
(
num_samples
,
1
,
1
)
# Repeat target probs num_samples * (k + 1) times.
# Rejection sampler requires bonus token probs, but they aren't used.
target_probs
=
target_probs
.
reshape
(
1
,
1
,
self
.
vocab_size
).
repeat
(
num_samples
,
self
.
k
+
1
,
1
)
# Randomly sample draft token ids from draft probs.
draft_token_ids
=
torch
.
multinomial
(
draft_probs
[:,
0
,
:],
num_samples
=
1
,
replacement
=
True
).
reshape
(
num_samples
,
self
.
k
)
# Bonus tokens not used but required.
bonus_token_ids
=
torch
.
zeros
((
1
,
self
.
num_bonus_tokens
),
dtype
=
torch
.
int64
,
device
=
"cuda"
).
repeat
(
num_samples
,
1
)
# Get output tokens via rejection sampling.
output_token_ids
=
self
.
rejection_sampler
(
target_probs
.
to
(
"cuda"
),
bonus_token_ids
.
to
(
"cuda"
),
draft_probs
.
to
(
"cuda"
),
draft_token_ids
.
to
(
"cuda"
))
# Remove bonus tokens
output_token_ids
=
output_token_ids
[:,
:
-
1
].
flatten
()
# Estimate probability density function
hist
=
torch
.
histogram
(
output_token_ids
.
to
(
dtype
=
torch
.
float
,
device
=
"cpu"
),
bins
=
self
.
vocab_size
,
range
=
self
.
vocab_range
,
density
=
True
)
return
hist
.
hist
tests/samplers/test_typical_acceptance_sampler.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for rejection sampling."""
import
pytest
import
torch
from
vllm.model_executor.layers.typical_acceptance_sampler
import
(
TypicalAcceptanceSampler
)
from
vllm.model_executor.utils
import
set_random_seed
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
)]
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
This file tests V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
def
get_zero_temperature_prob_dist
(
batch_size
,
k
,
vocab_size
):
"""
Generates a fake temperature zero probability distribution.
Returns:
1. A fake temperature zero probability distribution of shape
[batch_size, k, vocab_size]
2. Tensor of shape [batch_size, k] containing the token ids
of the probability 1.0 tokens at each position.
"""
# Simulate temperature 0 probability distribution for target probabilities
# and create target probabilities such that only 1 token id has
# probability 1.0
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
)
_
,
zero_temperature_token_ids
=
torch
.
max
(
probs
,
dim
=-
1
)
# set the probability of the tokens with ids in zero_temperature_token_ids
# to 1 and the rest to 0.
target_probs
=
torch
.
zeros_like
(
probs
).
scatter_
(
-
1
,
zero_temperature_token_ids
.
unsqueeze
(
-
1
),
1.0
)
return
target_probs
,
zero_temperature_token_ids
def
get_draft_token_ids
(
batch_size
:
int
,
k
:
int
,
vocab_size
:
int
,
token_ids_to_exclude
:
torch
.
Tensor
):
"""
Returns a tensor of shape [batch_size, k] of fake draft token ids
drawn randomly from a vocab of size vocab_size. We however ensure
that token_ids from token_ids_to_exclude are excluded at the
corresponding positions.
"""
draft_token_ids
=
torch
.
empty
(
batch_size
,
k
,
dtype
=
torch
.
long
)
for
i
in
range
(
batch_size
):
for
j
in
range
(
k
):
# Generate a random token ID excluding token_ids_to_exclude[i, j]
while
True
:
token_id
=
torch
.
randint
(
0
,
vocab_size
,
(
1
,
)).
item
()
if
token_id
!=
token_ids_to_exclude
[
i
,
j
]:
draft_token_ids
[
i
,
j
]
=
token_id
break
return
draft_token_ids
def
get_acceptance_sampler
(
posterior_threshold
:
float
=
0.03
,
posterior_alpha
:
float
=
0.9
,
strict_mode
:
bool
=
False
,
)
->
TypicalAcceptanceSampler
:
"""
Initializes and returns a TypicalAcceptanceSampler.
"""
return
TypicalAcceptanceSampler
(
posterior_threshold
,
posterior_alpha
,
strict_mode
)
@
pytest
.
mark
.
parametrize
(
"k"
,
list
(
range
(
1
,
6
)))
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
32
)))
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_no_crash_with_varying_dims
(
k
:
int
,
vocab_size
:
int
,
batch_size
:
int
,
device
:
str
):
"""
Tests that the TypicalAcceptancSampler forward succeeds for
different combinations of k, vocab_size, batch_size and num devices.
"""
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
()
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
target_with_bonus_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
# Verify that sampling succeeds for all cases.
typical_acceptance_sampler
(
target_with_bonus_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
@
pytest
.
mark
.
parametrize
(
"above_or_below_vocab_range"
,
[
"above"
,
"below"
])
@
pytest
.
mark
.
parametrize
(
"which_token_ids"
,
[
"bonus_token_ids"
,
"draft_token_ids"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_raises_when_vocab_oob
(
above_or_below_vocab_range
:
str
,
which_token_ids
:
str
,
device
:
str
):
"""
Tests that we throw an exception of the token ids fall outside
the bound of the provided vocabulary.
"""
k
=
3
batch_size
=
5
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
target_with_bonus_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
# Verify that appropriate exceptions are thrown for out
# of bound vocabs.
oob_token_ids
=
None
if
which_token_ids
==
"bonus_token_ids"
:
oob_token_ids
=
bonus_token_ids
elif
which_token_ids
==
"draft_token_ids"
:
oob_token_ids
=
draft_token_ids
else
:
raise
AssertionError
()
if
above_or_below_vocab_range
==
"above"
:
rogue_token_id
=
vocab_size
+
1
elif
above_or_below_vocab_range
==
"below"
:
rogue_token_id
=
-
1
else
:
raise
AssertionError
()
oob_token_ids
[
0
][
0
]
=
rogue_token_id
with
pytest
.
raises
(
AssertionError
):
typical_acceptance_sampler
(
target_with_bonus_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
10
)))
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_uniform_target_distribution_accepts_all_tokens
(
seed
:
int
,
device
:
str
):
"""
Test the TypicalAcceptanceSampler with a uniform target probability
distribution.
This test verifies that when provided with a uniform target probability
distribution, the TypicalAcceptanceSampler accepts all draft tokens. The
entropy of the uniform target distribution being high should lead to all
draft tokens being accepted.
"""
set_random_seed
(
seed
)
k
=
3
batch_size
=
5
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
target_with_bonus_probs
=
torch
.
rand
(
batch_size
,
k
+
1
,
vocab_size
,
dtype
=
torch
.
float32
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
output_token_ids
=
typical_acceptance_sampler
(
target_with_bonus_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
# We are using a uniform target probability distribution.
# For a uniform distribution the entropy is very high and it
# should lead to all draft tokens being accepted. Verify that.
assert
output_token_ids
.
shape
[
0
]
==
batch_size
assert
output_token_ids
.
shape
[
1
]
==
(
k
+
1
)
assert
torch
.
all
(
output_token_ids
[:,
-
1
]
==
bonus_token_ids
.
squeeze
())
assert
torch
.
all
(
output_token_ids
[:,
:
k
]
==
draft_token_ids
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
10
)))
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_temperature_zero_target_distribution
(
seed
:
int
,
device
:
str
):
"""
Test the TypicalAcceptanceSampler with a zero-temperature target
probability distribution.
This test verifies that when using a zero-temperature target probability
distribution, where only one token has a probability of 1.0, the
TypicalAcceptanceSampler correctly rejects all draft tokens that do not
match this probability. Additionally, it ensures that when all draft
tokens are rejected, the sampler falls back to greedy sampling to select a
single token from the target distribution.
"""
set_random_seed
(
seed
)
k
=
3
batch_size
=
5
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
# Simulate temperature 0 probability distribution for target probabilities
# and create target probabilities such that only 1 token id has
# probability 1.0
target_with_bonus_probs
,
zero_temperature_token_ids
=
\
get_zero_temperature_prob_dist
(
batch_size
,
k
+
1
,
vocab_size
)
zero_temperature_token_ids
=
zero_temperature_token_ids
[:,
:
-
1
]
# Populate draft_token_ids such that they exclude the token_ids
# with probability = 1.0
draft_token_ids
=
get_draft_token_ids
(
batch_size
,
k
,
vocab_size
,
zero_temperature_token_ids
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
# The target probaility distribution is a temperature zero distribution
# with zero entropy. Since our draft token ids don't match the probability
# 1.0 tokens in the target distribution we will reject all of them and
# fallback to the greedy sampling for selecting 1 token for each sequence.
# Verify the same.
output_token_ids
=
typical_acceptance_sampler
(
target_with_bonus_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
assert
output_token_ids
.
shape
[
0
]
==
batch_size
assert
output_token_ids
.
shape
[
1
]
==
(
k
+
1
)
assert
torch
.
all
(
output_token_ids
[:,
-
1
]
==
-
1
)
assert
torch
.
all
(
output_token_ids
[:,
0
]
==
zero_temperature_token_ids
[:,
0
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
10
)))
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_mixed_target_distribution
(
seed
:
int
,
device
:
str
):
"""
Test the TypicalAcceptanceSampler with a mixed target probability
distribution.
This test ensures that the TypicalAcceptanceSampler handles a mixed
target probability distribution correctly. Specifically, it uses a
zero-temperature distribution for some sequences and a uniform
distribution for others. The test verifies that:
- For sequences with a zero-temperature distribution, only the token
with a probability of 1.0 is accepted, and all other tokens are rejected.
- For sequences with a uniform distribution, all draft tokens are
accepted.
"""
set_random_seed
(
seed
)
k
=
3
batch_size
=
4
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
# For sequences 0 and 2 set the distribution to a temperature
# zero distribution. For sequences 1 and 3 set it to a uniform
# distribution.
target_with_bonus_probs
,
zero_temperature_token_ids
=
\
get_zero_temperature_prob_dist
(
batch_size
,
k
+
1
,
vocab_size
)
zero_temperature_token_ids
=
zero_temperature_token_ids
[:,
:
-
1
]
target_probs
=
target_with_bonus_probs
[:,
:
-
1
]
draft_token_ids
=
get_draft_token_ids
(
batch_size
,
k
,
vocab_size
,
zero_temperature_token_ids
)
uniform_probs
=
torch
.
rand
(
2
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
[[
1
,
3
]]
=
uniform_probs
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
output_token_ids
=
typical_acceptance_sampler
(
target_with_bonus_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
# verify the shape of output_token_ids
assert
output_token_ids
.
shape
[
0
]
==
batch_size
assert
output_token_ids
.
shape
[
1
]
==
(
k
+
1
)
# For sequences 0 and 2 verify that only 1 token is accepted
# which is the token with probability 1.0 in the target distribution
# at position 0.
assert
torch
.
all
(
output_token_ids
[[
0
,
2
],
1
:]
==
-
1
)
assert
(
torch
.
all
(
output_token_ids
[[
0
,
2
],
0
]
==
zero_temperature_token_ids
[[
0
,
2
],
0
]))
# For sequences 1 and 3 verify that all tokens are accepted since the
# target probability distribution is uniform. In addition verify that
# we also accept the bonus tokens.
assert
torch
.
all
(
output_token_ids
[[
1
,
3
],
:
-
1
]
==
draft_token_ids
[[
1
,
3
],
:])
assert
torch
.
all
(
output_token_ids
[[
1
,
3
],
-
1
]
!=
-
1
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
10
)))
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_accept_tokens_partially
(
seed
:
int
,
device
:
str
):
"""
Test the TypicalAcceptanceSampler's behavior when only a subset of draft
tokens should be accepted.
This test verifies that the TypicalAcceptanceSampler correctly accepts or
rejects draft tokens based on a zero-temperature target probability
distribution. Specifically, it ensures that:
- When all draft tokens match tokens with a probability of 1.0 in the
target distribution, all draft tokens are accepted.
- When only some draft tokens match tokens with a probability of 1.0 in
the target distribution, only those matching tokens are accepted, and the
rest are rejected.
"""
set_random_seed
(
seed
)
k
=
5
batch_size
=
1
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
# Create a temperature zero target probability distribution and ensure
# all draft token ids correspond to the tokens with 1.0 probability.
# Verify that all of them are accepted.
target_with_bonus_probs
,
zero_temperature_token_ids
=
\
get_zero_temperature_prob_dist
(
batch_size
,
k
+
1
,
vocab_size
)
zero_temperature_token_ids
=
zero_temperature_token_ids
[:,
:
-
1
]
draft_token_ids
=
zero_temperature_token_ids
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
output_token_ids
=
typical_acceptance_sampler
(
target_with_bonus_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
assert
output_token_ids
.
shape
[
0
]
==
batch_size
assert
output_token_ids
.
shape
[
1
]
==
(
k
+
1
)
assert
torch
.
all
(
output_token_ids
[:,
0
:
-
1
]
==
draft_token_ids
)
assert
torch
.
all
(
output_token_ids
[:,
-
1
]
==
bonus_token_ids
)
# Next only keep the first 2 draft tokens same as the zero temperature
# tokens. For the remaining 3 choose some other tokens. In the
# response we will expect the first 2 tokens to be the same as the
# draft tokens and the recovered token and rest as -1
draft_token_ids_to_replace
=
get_draft_token_ids
(
batch_size
,
k
,
vocab_size
,
zero_temperature_token_ids
)
draft_token_ids
=
torch
.
cat
(
(
draft_token_ids
[:,
:
2
],
draft_token_ids_to_replace
[:,
-
3
:]),
dim
=
1
)
output_token_ids
=
typical_acceptance_sampler
(
target_with_bonus_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
assert
output_token_ids
.
shape
[
0
]
==
batch_size
assert
output_token_ids
.
shape
[
1
]
==
(
k
+
1
)
assert
torch
.
all
(
output_token_ids
[:,
:
2
]
==
draft_token_ids
[:,
:
2
])
assert
torch
.
all
(
output_token_ids
[:,
2
]
==
target_with_bonus_probs
.
argmax
(
-
1
)[:,
2
])
assert
torch
.
all
(
output_token_ids
[:,
-
3
:]
==
-
1
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
1
)))
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_accept_tokens_set_non_default_posteriors
(
seed
:
int
,
device
:
str
):
"""
Test the TypicalAcceptanceSampler with custom posterior thresholds and
alpha values. This test verifies that by modifying the posterior
thresholds and alpha values we can change the acceptance behavior of the
sampler.
"""
set_random_seed
(
seed
)
k
=
5
batch_size
=
1
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
# Simulate temperature 0 probability distribution for target
# probabilities and create target probabilities such that only 1 token
# id has probability 1.0 and others have a very low probability of
# 0.00001. Populate draft_token_ids such that they exclude the token_ids
# with probability = 1.0. Without any changes to the posterior thresholds
# none of the draft tokens are accepted.
target_probs
,
zero_temperature_token_ids
=
get_zero_temperature_prob_dist
(
batch_size
,
k
+
1
,
vocab_size
)
zero_temperature_token_ids
=
zero_temperature_token_ids
[:,
:
-
1
]
target_probs
[
target_probs
==
0
]
=
0.00001
draft_token_ids
=
get_draft_token_ids
(
batch_size
,
k
,
vocab_size
,
zero_temperature_token_ids
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
output_token_ids
=
typical_acceptance_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
assert
output_token_ids
.
shape
[
0
]
==
batch_size
assert
output_token_ids
.
shape
[
1
]
==
(
k
+
1
)
assert
torch
.
all
(
output_token_ids
[:,
1
:
-
1
]
==
-
1
)
# Change the posterior threshold values to 0.0 so that we will
# now accept even draft tokens with very low probability in the
# target distribution. Simulate and verify the same.
typical_acceptance_sampler
=
TypicalAcceptanceSampler
(
strict_mode
=
True
,
posterior_threshold
=
0.0
,
posterior_alpha
=
0.0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
output_token_ids
=
typical_acceptance_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
=
None
,
draft_token_ids
=
draft_token_ids
)
assert
output_token_ids
.
shape
[
0
]
==
batch_size
assert
output_token_ids
.
shape
[
1
]
==
(
k
+
1
)
assert
torch
.
all
(
output_token_ids
[:,
0
:
-
1
]
==
draft_token_ids
)
assert
torch
.
all
(
output_token_ids
[:,
-
1
]
==
bonus_token_ids
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
10
)))
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_get_recovered_token_ids
(
seed
:
int
,
device
:
str
):
"""
Test the TypicalAcceptanceSampler's method for generating
replacement token IDs.
This test verifies that the `_get_recovered_token_ids` method of the
TypicalAcceptanceSampler correctly identifies the token IDs to be used
as recovered token IDs based on the target probability distribution.
Specifically, it ensures that the method correctly identifies the
tokens with the highest probability for each sequence in the batch.
"""
set_random_seed
(
seed
)
k
=
10
batch_size
=
5
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
expected_replacement_tokens
=
torch
.
argmax
(
target_probs
,
dim
=-
1
)
actual_replacement_tokens
=
(
typical_acceptance_sampler
.
_get_recovered_token_ids
(
target_probs
))
assert
torch
.
all
(
expected_replacement_tokens
==
actual_replacement_tokens
)
tests/spec_decode/__init__.py
deleted
100644 → 0
View file @
9ffe905a
tests/spec_decode/conftest.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
tests/spec_decode/e2e/__init__.py
deleted
100644 → 0
View file @
9ffe905a
tests/spec_decode/e2e/conftest.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
from
itertools
import
cycle
from
typing
import
Optional
,
Union
import
pytest
import
torch
from
vllm
import
LLM
,
SamplingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.sequence
import
PromptLogprobs
,
SampleLogprobs
from
...models.utils
import
(
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
,
check_logprobs_close
,
check_outputs_equal
)
from
...utils
import
RemoteOpenAIServer
PROMPTS
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
"San Francisco is know for its"
,
"Facebook was created in 2004 by"
,
"Curious George is a"
,
"Python 3.11 brings improvements to its"
,
]
@
pytest
.
fixture
def
test_llm_generator
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
test_llm_kwargs
,
seed
):
def
generate
():
kwargs
=
{
**
common_llm_kwargs
,
**
per_test_common_llm_kwargs
,
**
test_llm_kwargs
,
}
llm
=
LLM
(
**
kwargs
)
if
seed
is
not
None
:
set_random_seed
(
seed
)
yield
llm
del
llm
cleanup_dist_env_and_memory
()
return
generate
def
maybe_assert_ngram_worker
(
llm
):
# Verify the proposer worker is ngram if ngram is specified.
if
(
llm
.
llm_engine
.
speculative_config
is
not
None
and
llm
.
llm_engine
.
speculative_config
.
method
==
"ngram"
):
from
vllm.spec_decode.ngram_worker
import
NGramWorker
assert
isinstance
(
llm
.
llm_engine
.
model_executor
.
driver_worker
.
proposer_worker
,
NGramWorker
)
def
get_output_from_llm_generator
(
llm_generator
,
prompts
,
sampling_params
)
->
tuple
[
list
[
str
],
list
[
list
[
int
]],
float
]:
tokens
:
list
[
str
]
=
[]
token_ids
:
list
[
list
[
int
]]
=
[]
acceptance_rate
:
float
=
-
1.0
for
llm
in
llm_generator
():
maybe_assert_ngram_worker
(
llm
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
token_ids
=
[
output
.
outputs
[
0
].
token_ids
for
output
in
outputs
]
tokens
=
[
output
.
outputs
[
0
].
text
for
output
in
outputs
]
# Fetch acceptance rate if logging is enabled.
if
stat_loggers
:
=
getattr
(
llm
.
llm_engine
,
"stat_loggers"
,
None
):
stat_logger
=
stat_loggers
[
"prometheus"
]
acceptance_rate
=
(
stat_logger
.
metrics
.
gauge_spec_decode_draft_acceptance_rate
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
())
del
llm
return
tokens
,
token_ids
,
acceptance_rate
def
check_logprobs_correctness
(
spec_outputs
:
Sequence
[
Union
[
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
]],
baseline_outputs
:
Sequence
[
Union
[
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
]],
disable_logprobs
:
bool
=
False
,
):
"""Compare sampled and prompt logprobs between baseline and spec decoding
"""
if
not
disable_logprobs
:
return
check_logprobs_close
(
outputs_0_lst
=
baseline_outputs
,
outputs_1_lst
=
spec_outputs
,
name_0
=
"org"
,
name_1
=
"sd"
,
)
# Check correctness when disable_logprobs == True
for
spec_output
,
baseline_output
in
zip
(
spec_outputs
,
baseline_outputs
):
# Check generated token logprobs.
spec_logprobs
=
spec_output
[
2
]
baseline_logprobs
=
baseline_output
[
2
]
_check_logprobs_when_output_disabled
(
spec_logprobs
,
baseline_logprobs
,
is_prompt_logprobs
=
False
)
# Check prompt logprobs too, if they exist
if
len
(
baseline_output
)
==
4
:
assert
len
(
spec_output
)
==
4
spec_prompt_logprobs
=
spec_output
[
3
]
baseline_prompt_logprobs
=
baseline_output
[
3
]
_check_logprobs_when_output_disabled
(
spec_prompt_logprobs
,
baseline_prompt_logprobs
,
is_prompt_logprobs
=
True
)
def
_check_logprobs_when_output_disabled
(
spec_logprobs
:
Union
[
Optional
[
PromptLogprobs
],
SampleLogprobs
],
baseline_logprobs
:
Union
[
Optional
[
PromptLogprobs
],
SampleLogprobs
],
is_prompt_logprobs
:
bool
=
False
,
):
# Prompt logprobs are optional
if
is_prompt_logprobs
and
baseline_logprobs
is
None
:
assert
spec_logprobs
is
None
return
assert
spec_logprobs
is
not
None
assert
baseline_logprobs
is
not
None
assert
len
(
spec_logprobs
)
==
len
(
baseline_logprobs
)
# For each generated position of the sequence.
for
pos
,
(
spec_pos_logprobs
,
baseline_pos_logprobs
)
in
enumerate
(
zip
(
spec_logprobs
,
baseline_logprobs
)):
# First prompt logprob is expected to be None
if
is_prompt_logprobs
and
baseline_pos_logprobs
is
None
:
assert
spec_pos_logprobs
is
None
assert
pos
==
0
continue
assert
spec_pos_logprobs
is
not
None
assert
baseline_pos_logprobs
is
not
None
# When disabled, the 1 logprob is returned with dummy values for the
# score and rank, but the token id should match the baseline model
assert
len
(
spec_pos_logprobs
)
==
1
(
spec_pos_logprob_token_id
,
spec_pos_logprob
)
=
next
(
iter
(
spec_pos_logprobs
.
items
()))
assert
spec_pos_logprob
.
rank
==
-
1
assert
spec_pos_logprob
.
logprob
==
0.0
if
isinstance
(
spec_pos_logprob_token_id
,
torch
.
Tensor
):
spec_pos_logprob_token_id
=
spec_pos_logprob_token_id
.
item
()
assert
spec_pos_logprob_token_id
in
baseline_pos_logprobs
def
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
max_output_len
:
int
,
seed
:
Optional
[
int
]
=
0
,
temperature
:
float
=
0.0
,
disable_seed
:
bool
=
False
,
ignore_eos
:
bool
=
True
,
ensure_all_accepted
:
bool
=
False
,
expected_acceptance_rate
:
Optional
[
float
]
=
None
,
logprobs
:
Optional
[
int
]
=
None
,
prompt_logprobs
:
Optional
[
int
]
=
None
,
disable_logprobs
:
bool
=
False
):
org_args
=
{
**
common_llm_kwargs
,
**
per_test_common_llm_kwargs
,
**
baseline_llm_kwargs
,
}
sd_args
=
{
**
common_llm_kwargs
,
**
per_test_common_llm_kwargs
,
**
test_llm_kwargs
,
}
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
PROMPTS
),
range
(
batch_size
))]
if
disable_seed
:
seed
=
None
sampling_params
=
SamplingParams
(
temperature
=
temperature
,
max_tokens
=
max_output_len
,
seed
=
seed
,
ignore_eos
=
ignore_eos
,
logprobs
=
logprobs
,
prompt_logprobs
=
prompt_logprobs
)
with
vllm_runner
(
**
org_args
)
as
vllm_model
:
org_outputs
=
vllm_model
.
generate_w_logprobs
(
prompts
,
sampling_params
)
with
vllm_runner
(
**
sd_args
)
as
vllm_model
:
if
ensure_all_accepted
or
expected_acceptance_rate
is
not
None
:
# Force log interval to be 0 to catch all metrics.
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_loggers
[
'prometheus'
]
stat_logger
.
local_interval
=
-
100
sd_outputs
=
vllm_model
.
generate_w_logprobs
(
prompts
,
sampling_params
)
if
ensure_all_accepted
or
expected_acceptance_rate
is
not
None
:
acceptance_rate
=
(
stat_logger
.
metrics
.
gauge_spec_decode_draft_acceptance_rate
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
())
if
ensure_all_accepted
:
assert
True
# FIXME: ci fails to log acceptance rate.
# It works locally.
# assert acceptance_rate == 1.0
if
expected_acceptance_rate
is
not
None
:
assert
acceptance_rate
>=
expected_acceptance_rate
-
1e-2
# Only pass token entries, not the logprobs
check_outputs_equal
(
outputs_0_lst
=
[
out
[
0
:
2
]
for
out
in
org_outputs
],
outputs_1_lst
=
[
out
[
0
:
2
]
for
out
in
sd_outputs
],
name_0
=
"org"
,
name_1
=
"sd"
)
# Check logprobs if requested
if
logprobs
is
not
None
or
prompt_logprobs
is
not
None
:
check_logprobs_correctness
(
sd_outputs
,
org_outputs
,
disable_logprobs
)
def
run_equality_correctness_test_tp
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
max_output_len
:
int
,
seed
:
int
=
0
,
temperature
:
float
=
0.0
,
logprobs
:
Optional
[
int
]
=
None
):
"""Helper method that compares the outputs of both the baseline LLM and
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the same when temperature is zero.
"""
arg1
=
common_llm_kwargs
+
per_test_common_llm_kwargs
+
baseline_llm_kwargs
arg2
=
common_llm_kwargs
+
per_test_common_llm_kwargs
+
test_llm_kwargs
env1
=
env2
=
None
max_wait_seconds
=
240
results
=
[]
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
PROMPTS
),
range
(
batch_size
))]
for
args
,
env
in
((
arg1
,
env1
),
(
arg2
,
env2
)):
with
RemoteOpenAIServer
(
model
,
args
,
env_dict
=
env
,
max_wait_seconds
=
max_wait_seconds
)
as
server
:
client
=
server
.
get_client
()
completion
=
client
.
completions
.
create
(
model
=
model
,
prompt
=
prompts
,
max_tokens
=
max_output_len
,
seed
=
seed
,
temperature
=
temperature
,
logprobs
=
logprobs
)
results
.
append
({
"test"
:
"seeded_sampling"
,
"text"
:
[
choice
.
text
for
choice
in
completion
.
choices
],
"logprobs"
:
[
choice
.
logprobs
for
choice
in
completion
.
choices
],
"finish_reason"
:
[
choice
.
finish_reason
for
choice
in
completion
.
choices
],
"usage"
:
completion
.
usage
,
})
n
=
len
(
results
)
//
2
arg1_results
=
results
[:
n
]
arg2_results
=
results
[
n
:]
# Separate logprobs to avoid asserting exact equality.
arg1_logprobs
=
[
r
.
pop
(
"logprobs"
)
for
r
in
arg1_results
]
arg2_logprobs
=
[
r
.
pop
(
"logprobs"
)
for
r
in
arg2_results
]
for
arg1_result
,
arg2_result
in
zip
(
arg1_results
,
arg2_results
):
assert
arg1_result
==
arg2_result
,
(
f
"Results for
{
model
=
}
are not the same with
{
arg1
=
}
and
{
arg2
=
}
. "
f
"
{
arg1_result
=
}
!=
{
arg2_result
=
}
"
)
if
logprobs
:
for
logs1
,
logs2
in
zip
(
arg1_logprobs
,
arg2_logprobs
):
for
l1
,
l2
in
zip
(
logs1
,
logs2
):
assert
l1
.
tokens
==
l2
.
tokens
tests/spec_decode/e2e/test_compatibility.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm
import
SamplingParams
from
.conftest
import
get_output_from_llm_generator
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
"meta-llama/Llama-3.2-1B-Instruct"
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
# Speculative max model len > overridden max model len should raise.
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
129
,
},
"max_model_len"
:
128
,
},
{
# Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
2048
+
1
,
},
},
{
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
131072
+
1
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_xfail_spec_max_model_len
(
test_llm_generator
):
"""Verify that speculative decoding validates speculative_max_model_len.
"""
output_len
=
128
temperature
=
0.0
prompts
=
[
"Hello, my name is"
,
]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
)
with
pytest
.
raises
(
ValueError
,
match
=
"cannot be larger than"
):
get_output_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
tests/spec_decode/e2e/test_eagle_correctness.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""This docstring details important information on the testing methodology.
Most of the tests rely on "greedy equality", where we expect the output of
speculative decoding on a sequence to exactly match the output of normal non-
speculative decoding.
Since speculative decoding with rejection sampling guarantees that the output
distribution matches the target model's output distribution (up to hardware
numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
equality.
However, we still need to verify below scenario could be passed:
* Batch size 1 greedy equality
* Batch size >1 greedy equality
* Test greedy equality under preemption
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, EAGLE would not break the
correctness for the target model outputs.
"""
import
pytest
from
.conftest
import
run_equality_correctness_test
# main model
MAIN_MODEL
=
"JackFram/llama-68m"
# speculative model
SPEC_MODEL
=
"abhigoyal/vllm-eagle-llama-68m-random"
# max. number of speculative tokens: this corresponds to
# num_heads in the config.json of the speculator model.
MAX_SPEC_TOKENS
=
4
# precision
PRECISION
=
"float32"
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
# Precision
"dtype"
:
PRECISION
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
128
,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
32
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_eagle_e2e_greedy_correctness
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
# Precision
"dtype"
:
PRECISION
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"disable_logprobs"
:
False
,
},
},
{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"disable_logprobs"
:
True
,
},
}])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
128
,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
1
,
6
])
def
test_eagle_e2e_greedy_logprobs
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
,
logprobs
:
int
):
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
,
logprobs
=
logprobs
,
prompt_logprobs
=
logprobs
,
disable_logprobs
=
test_llm_kwargs
[
"speculative_config"
]
[
"disable_logprobs"
])
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"enforce_eager"
:
False
,
# Print spec metrics.
"disable_log_stats"
:
False
,
# Precision
"dtype"
:
PRECISION
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
128
,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
32
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_eagle_e2e_greedy_correctness_cuda_graph
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify greedy equality with cuda graph enabled and different
batch sizes."""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"block_size"
:
8
,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use small output len for fast test.
128
,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_eagle_e2e_greedy_correctness_with_preemption
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify greedy equality, even when some sequences are preempted mid-
generation.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
k
,
},
}
# Try a range of num. speculative tokens
for
k
in
range
(
1
,
1
+
MAX_SPEC_TOKENS
)
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_eagle_different_k
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify that eagle speculative decoding produces exact equality
to without spec decode with different values of num_speculative_tokens.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
# Main model
"model_name"
:
MAIN_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"disable_by_batch_size"
:
4
,
},
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_eagle_disable_queue
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify that eagle speculative decoding produces exact equality
to without spec decode when speculation is disabled for large
batch sizes.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
# Precision
"dtype"
:
"float16"
,
# Main model
"model_name"
:
"meta-llama/Llama-2-7b-chat-hf"
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-llama2-chat-7B"
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_llama2_eagle_e2e_greedy_correctness
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# 2 for small prompt, 256//16 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
16
,
"max_model_len"
:
(
2
+
256
//
16
)
*
16
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
# Precision
"dtype"
:
"float16"
,
# Main model
"model_name"
:
"meta-llama/Meta-Llama-3-8B-Instruct"
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_llama3_eagle_e2e_greedy_correctness
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# 2 for small prompt, 256//16 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
16
,
"max_model_len"
:
(
2
+
256
//
16
)
*
16
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
# Precision
"dtype"
:
"float16"
,
# Main model
"model_name"
:
"Qwen/Qwen2-7B-Instruct"
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-Qwen2-7B-Instruct"
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_qwen2_eagle_e2e_greedy_correctness
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
,
temperature
=
0.0
)
if
__name__
==
"__main__"
:
import
pytest
pytest
.
main
([
__file__
])
tests/spec_decode/e2e/test_integration.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests which cover integration of the speculative decoding framework with
other features, e.g. cuda graphs.
"""
import
pytest
from
.conftest
import
run_equality_correctness_test
MAIN_MODEL
=
"JackFram/llama-68m"
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model_name"
:
"JackFram/llama-68m"
,
# Verify equality when cuda graphs allowed.
"enforce_eager"
:
False
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
# Identical models.
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_cuda_graph
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify spec decode equality when cuda graphs are enabled.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model_name"
:
"JackFram/llama-160m"
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
# Explicitly specify draft model quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"num_speculative_tokens"
:
5
,
"quantization"
:
"gptq"
,
},
},
# Explicitly specify GPTQ-based draft model to use marlin quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"num_speculative_tokens"
:
5
,
"quantization"
:
"marlin"
,
},
},
# Not explicitly specify draft model quantization
{
"speculative_config"
:
{
"model"
:
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"num_speculative_tokens"
:
5
,
"quantization"
:
None
,
},
},
])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_speculative_model_quantization_config
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with draft model quantization configs.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model_name"
:
MAIN_MODEL
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"disable_mqa_scorer"
:
True
,
},
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_mqa_scorer
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify that speculative decoding generates the same output
with batch expansion scorer and mqa scorer.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
tests/spec_decode/e2e/test_integration_dist_tp2.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests which cover integration of the speculative decoding framework with
tensor parallelism.
"""
import
json
from
typing
import
Optional
import
pytest
import
torch
from
vllm.platforms
import
current_platform
from
.conftest
import
run_equality_correctness_test_tp
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[[
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor-parallel-size"
,
"2"
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
}),
],
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"ngram"
,
"num_speculative_tokens"
:
5
,
"prompt_lookup_max"
:
3
,
}),
],
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_target_model_tp_gt_1
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify greedy equality when tensor parallelism is used.
"""
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"hip is not well-supported yet"
)
run_equality_correctness_test_tp
(
"JackFram/llama-68m"
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[[
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor_parallel_size"
,
"2"
,
# precision
"--dtype"
,
"bfloat16"
,
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
[(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
}),
]),
(
"ibm-granite/granite-3b-code-instruct"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"ibm-granite/granite-3b-code-instruct"
,
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
}),
])])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_draft_model_tp_lt_target_model_tp2
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with smaller tp for draft models.
"""
run_equality_correctness_test_tp
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[[
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor_parallel_size"
,
"2"
,
# precision
"--dtype"
,
"bfloat16"
,
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[
"--enable-chunked-prefill"
,
"False"
],
[
"--enable-chunked-prefill"
,
"True"
,
"--max-num-batched-tokens"
,
"4"
,
"--max-num-seqs"
,
"4"
]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
[(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
}),
]),
(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"draft_tensor_parallel_size"
:
1
,
}),
])])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
None
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_chunked_prefill_tp2
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
logprobs
:
Optional
[
int
],
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
,
logprobs
=
logprobs
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[[
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor_parallel_size"
,
"2"
,
# precision
"--dtype"
,
"bfloat16"
,
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[
"--enable-chunked-prefill"
,
"False"
],
[
"--enable-chunked-prefill"
,
"True"
,
"--max-num-batched-tokens"
,
"4"
,
"--max-num-seqs"
,
"4"
]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
[(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"disable_logprobs"
:
False
,
}),
]),
(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"draft_tensor_parallel_size"
:
1
,
"disable_logprobs"
:
False
,
}),
])])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_chunked_prefill_tp2_with_logprobs
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
logprobs
:
Optional
[
int
],
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
,
logprobs
=
logprobs
)
tests/spec_decode/e2e/test_integration_dist_tp4.py
deleted
100644 → 0
View file @
9ffe905a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests which cover integration of the speculative decoding framework with
tensor parallelism.
"""
import
json
import
openai
import
pytest
import
torch
from
.conftest
import
run_equality_correctness_test_tp
MAIN_MODEL
=
"JackFram/llama-68m"
SPEC_MODEL
=
"JackFram/llama-68m"
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[[
# Skip cuda graph recording for fast test.
"--enforce_eager"
,
"--tensor-parallel-size"
,
"4"
,
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
[],
])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
f
"
{
SPEC_MODEL
}
"
,
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
}),
],
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_draft_model_tp_lt_target_model_tp4
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with smaller tp for draft models.
"""
run_equality_correctness_test_tp
(
MAIN_MODEL
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[[
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor-parallel-size"
,
"4"
,
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
[
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative_config"
,
json
.
dumps
({
"model"
:
f
"
{
SPEC_MODEL
}
"
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
32
,
}),
],
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# This must be a good bit larger than speculative_max_model_len so that
# we can test the case where all seqs are skipped, but still small to
# ensure fast test.
64
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_skip_speculation
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify job failure with RuntimeError when all sequences skip speculation.
We do this by setting the max model len of the draft model to an
artificially low value, such that when the sequences grow beyond it, they
are skipped in speculative decoding.
TODO: fix it to pass without raising Error. (#5814)
"""
with
pytest
.
raises
(
(
openai
.
APIConnectionError
,
openai
.
InternalServerError
)):
run_equality_correctness_test_tp
(
MAIN_MODEL
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
output_len
,
seed
,
temperature
=
0.0
)
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment