Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6d2051cc
Commit
6d2051cc
authored
Oct 21, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev
parents
2c7f740a
a2c71c54
Changes
457
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
365 additions
and
334 deletions
+365
-334
tests/prefix_caching/test_prefix_caching.py
tests/prefix_caching/test_prefix_caching.py
+0
-82
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+7
-6
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+28
-11
tests/quantization/test_ipex_quant.py
tests/quantization/test_ipex_quant.py
+28
-0
tests/samplers/test_beam_search.py
tests/samplers/test_beam_search.py
+2
-2
tests/samplers/test_sampler.py
tests/samplers/test_sampler.py
+18
-32
tests/spec_decode/e2e/test_compatibility.py
tests/spec_decode/e2e/test_compatibility.py
+10
-50
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+0
-18
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+41
-5
tests/spec_decode/e2e/test_integration_dist_tp2.py
tests/spec_decode/e2e/test_integration_dist_tp2.py
+0
-6
tests/spec_decode/e2e/test_integration_dist_tp4.py
tests/spec_decode/e2e/test_integration_dist_tp4.py
+0
-6
tests/spec_decode/e2e/test_logprobs.py
tests/spec_decode/e2e/test_logprobs.py
+0
-14
tests/spec_decode/e2e/test_medusa_correctness.py
tests/spec_decode/e2e/test_medusa_correctness.py
+46
-18
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
+40
-24
tests/spec_decode/e2e/test_multistep_correctness.py
tests/spec_decode/e2e/test_multistep_correctness.py
+0
-36
tests/spec_decode/e2e/test_ngram_correctness.py
tests/spec_decode/e2e/test_ngram_correctness.py
+45
-15
tests/spec_decode/e2e/test_seed.py
tests/spec_decode/e2e/test_seed.py
+0
-3
tests/spec_decode/test_multi_step_worker.py
tests/spec_decode/test_multi_step_worker.py
+4
-2
tests/spec_decode/test_scorer.py
tests/spec_decode/test_scorer.py
+91
-0
tests/spec_decode/test_spec_decode_worker.py
tests/spec_decode/test_spec_decode_worker.py
+5
-4
No files found.
Too many changes to show.
To preserve performance only
457 of 457+
files are displayed.
Plain diff
Email patch
tests/prefix_caching/test_prefix_caching.py
View file @
6d2051cc
...
...
@@ -2,14 +2,9 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
from
typing
import
List
import
pytest
from
tests.kernels.utils
import
override_backend_env_variable
from
vllm.block
import
PhysicalTokenBlock
from
vllm.core.block_manager_v1
import
CachedBlockAllocator
from
vllm.utils
import
Device
from
..models.utils
import
check_outputs_equal
...
...
@@ -18,86 +13,11 @@ MODELS = [
]
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
16
])
def
test_block_allocator
(
block_size
:
int
,
num_blocks
:
int
,
):
block_hash
=
1
block_allocator
=
CachedBlockAllocator
(
Device
.
CPU
,
block_size
,
num_blocks
)
# Allocate two PysicalTokenBlocks with the same hash and check
# that they are the same PhysicalTokenBlock
first_block
=
block_allocator
.
allocate
(
block_hash
,
0
)
second_block
=
block_allocator
.
allocate
(
block_hash
,
0
)
assert
(
first_block
==
second_block
)
assert
(
second_block
.
ref_count
==
2
)
# Check metric: 1 hit of 2 queries
assert
block_allocator
.
get_prefix_cache_hit_rate
()
==
0.5
# Free the first_block and confirm that the ref_count is correctly
# decremented on the second block
block_allocator
.
free
(
first_block
)
assert
(
second_block
.
ref_count
==
1
)
# Free the second block
block_allocator
.
free
(
second_block
)
# Reallocate the first block and confirm that, even after the block
# had its ref_count go to 0, we still get the same block back
first_block
=
block_allocator
.
allocate
(
block_hash
,
0
)
assert
(
first_block
==
second_block
)
assert
(
first_block
.
block_hash
==
block_hash
)
# Allocate one more time to get 3/4 hit rate for easy checking
block_allocator
.
allocate
(
block_hash
,
0
)
assert
block_allocator
.
get_prefix_cache_hit_rate
()
==
0.75
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
16
])
def
test_eviction
(
num_blocks
:
int
,
):
block_size
=
16
block_allocator
=
CachedBlockAllocator
(
Device
.
CPU
,
block_size
,
num_blocks
)
blocks
:
List
[
PhysicalTokenBlock
]
=
[]
for
i
in
range
(
num_blocks
):
# use i as the block_hash
blocks
.
append
(
block_allocator
.
allocate
(
i
,
0
))
#Free all blocks
for
block
in
blocks
:
block_allocator
.
free
(
block
)
# Allocate a new block and confirm that it's the first block freed.
# I.E The Least Recently Used block
new_block_hash
=
block_size
new_block
=
block_allocator
.
allocate
(
new_block_hash
,
0
)
assert
(
new_block
==
blocks
[
0
])
assert
(
new_block
.
block_hash
==
new_block_hash
)
# Reallocate the second in blocks to remove it from the free list
realloc_block_hash
=
1
realloc_block
=
block_allocator
.
allocate
(
realloc_block_hash
,
0
)
assert
(
realloc_block
==
blocks
[
realloc_block_hash
])
assert
(
realloc_block
.
block_hash
==
realloc_block_hash
)
# Allocate a new block and confirm that it's not the realloc_block,
# since the realloc_block shouldn't be in the free list
new_block_hash
=
block_size
+
1
new_block
=
block_allocator
.
allocate
(
new_block_hash
,
0
)
assert
(
realloc_block
!=
new_block
)
assert
(
new_block
.
block_hash
==
new_block_hash
)
assert
(
new_block
.
block_number
==
2
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
"XFORMERS"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"cached_position"
,
[
0
,
1
])
@
pytest
.
mark
.
parametrize
(
"use_v2_block_manager"
,
[
False
,
True
])
def
test_mixed_requests
(
hf_runner
,
vllm_runner
,
...
...
@@ -107,7 +27,6 @@ def test_mixed_requests(
dtype
:
str
,
max_tokens
:
int
,
cached_position
:
int
,
use_v2_block_manager
:
bool
,
monkeypatch
,
)
->
None
:
"""
...
...
@@ -125,7 +44,6 @@ def test_mixed_requests(
model
,
dtype
=
dtype
,
enable_prefix_caching
=
True
,
use_v2_block_manager
=
use_v2_block_manager
,
)
as
vllm_model
:
# Run the first prompt so the cache is populated
vllm_outputs
=
vllm_model
.
generate_greedy
([
cached_prompt
],
max_tokens
)
...
...
tests/quantization/test_bitsandbytes.py
View file @
6d2051cc
...
...
@@ -9,22 +9,22 @@ import pytest
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
fork_new_process_for_each_test
from
tests.utils
import
fork_new_process_for_each_test
models_4bit_to_test
=
[
(
'huggyllama/llama-7b'
,
'
quantize model inflight
'
),
(
"facebook/opt-125m"
,
"
quantize
opt
model inflight
"
),
]
models_pre_qaunt_4bit_to_test
=
[
(
'lllyasviel/omost-llama-3-8b-4bits'
,
'read pre-quantized 4-bit NF4 model'
),
(
'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'
,
'read pre-quantized 4-bit FP4 model'
),
(
'poedator/opt-125m-bnb-4bit'
,
'read pre-quantized 4-bit NF4 opt model'
),
]
models_pre_quant_8bit_to_test
=
[
(
'meta-llama/Llama-Guard-3-8B-INT8'
,
'read pre-quantized 8-bit model'
),
(
'meta-llama/Llama-Guard-3-8B-INT8'
,
'read pre-quantized llama 8-bit model'
),
(
"yec019/fbopt-350m-8bit"
,
"read pre-quantized 8-bit opt model"
),
]
...
...
@@ -133,6 +133,7 @@ def validate_generated_texts(hf_runner,
hf_str
=
hf_log
[
"generated_text"
]
vllm_str
=
vllm_log
[
"generated_text"
]
prompt
=
hf_log
[
"prompt"
]
assert
hf_str
==
vllm_str
,
(
f
"Model:
{
model_name
}
"
f
"Mismatch between HF and vLLM outputs:
\n
"
f
"Prompt:
{
prompt
}
\n
"
...
...
tests/quantization/test_compressed_tensors.py
View file @
6d2051cc
...
...
@@ -2,26 +2,28 @@
Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
from
typing
import
Optional
import
pytest
import
torch
from
compressed_tensors.quantization
import
QuantizationType
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsLinearMethod
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A8Int8
,
CompressedTensorsW8A16Fp8
,
CompressedTensorsWNA16
)
from
vllm.model_executor.layers.quantization.compressed_tensors.utils
import
(
QuantizationType
)
@
pytest
.
mark
.
parametrize
(
"model_args"
,
[
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
"tensor"
,
QuantizationType
.
INT
,
2560
),
(
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"
,
"channel"
,
QuantizationType
.
INT
,
2560
),
])
@
pytest
.
mark
.
parametrize
(
"model_args"
,
[(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
"tensor"
,
QuantizationType
.
INT
,
2560
,
True
),
(
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"
,
"channel"
,
QuantizationType
.
INT
,
2560
,
True
),
(
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"
,
"tensor"
,
QuantizationType
.
INT
,
2560
,
False
)])
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
,
model_args
):
model_path
,
strategy
,
quant_type
,
shape_0
=
model_args
model_path
,
strategy
,
quant_type
,
shape_0
,
is_symmetric
=
model_args
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -31,6 +33,18 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
gate_up_proj
=
layer
.
mlp
.
gate_up_proj
down_proj
=
layer
.
mlp
.
down_proj
# assert zp for symmetric and asymmetric cases
def
zp_valid
(
zp
:
Optional
[
torch
.
Tensor
]):
if
is_symmetric
:
return
zp
is
None
return
zp
is
not
None
and
zp
.
dtype
is
torch
.
int32
assert
zp_valid
(
qkv_proj
.
input_zero_point
)
assert
zp_valid
(
o_proj
.
input_zero_point
)
assert
zp_valid
(
gate_up_proj
.
input_zero_point
)
assert
zp_valid
(
down_proj
.
input_zero_point
)
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
o_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
gate_up_proj
.
quant_method
,
...
...
@@ -69,9 +83,12 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
@
pytest
.
mark
.
parametrize
(
"model_args"
,
[
(
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
,
"tensor"
),
(
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"
,
"tensor"
),
(
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
,
"channel"
),
(
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"
,
"channel"
),
])
def
test_compressed_tensors_w8a8_dyna
n
mic_per_token
(
vllm_runner
,
model_args
):
def
test_compressed_tensors_w8a8_dynamic_per_token
(
vllm_runner
,
model_args
):
model_path
,
strategy
=
model_args
with
vllm_runner
(
model_path
,
dtype
=
torch
.
float16
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
...
...
@@ -160,4 +177,4 @@ def test_compressed_tensors_kv_cache(vllm_runner):
model_path
=
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
with
vllm_runner
(
model_path
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
output
=
llm
.
generate_greedy
(
"Hello world!"
,
max_tokens
=
20
)
assert
output
\ No newline at end of file
assert
output
tests/quantization/test_ipex_quant.py
0 → 100644
View file @
6d2051cc
"""Test model set-up and inference for quantized HF models supported
on the CPU backend using IPEX (including AWQ).
Validating the configuration and printing results for manual checking.
Run `pytest tests/quantization/test_ipex_quant.py`.
"""
import
pytest
from
vllm.platforms
import
current_platform
MODELS
=
[
"casperhansen/llama-3-8b-instruct-awq"
,
]
DTYPE
=
[
"bfloat16"
]
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cpu
(),
reason
=
"only supports the CPU backend."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPE
)
def
test_ipex_quant
(
vllm_runner
,
model
,
dtype
):
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
assert
output
print
(
output
)
tests/samplers/test_beam_search.py
View file @
6d2051cc
...
...
@@ -33,8 +33,8 @@ def test_beam_search_single_input(
max_tokens
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_beam_search
_new
(
example_prompts
,
beam_width
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_texts
=
hf_outputs
[
i
]
...
...
tests/samplers/test_sampler.py
View file @
6d2051cc
import
itertools
import
random
from
dataclasses
import
dataclass
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
unittest.mock
import
Mock
,
patch
...
...
@@ -158,26 +159,6 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str):
assert
first_sampler_output
==
second_sampler_output
@
pytest
.
mark
.
parametrize
(
"seed"
,
RANDOM_SEEDS
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_sampler_all_beam
(
seed
:
int
,
device
:
str
):
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
batch_size
=
random
.
randint
(
1
,
256
)
_
,
fake_logits
,
sampler
=
_prepare_test
(
batch_size
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
best_of
=
2
,
use_beam_search
=
True
,
)
_do_sample
(
batch_size
,
fake_logits
,
sampler
,
sampling_params
,
device
)
# no assertion here as I am not sure how to determine whether
# the outputs are expected - in other words, this just tests
# whether there are no exceptions in the sampler
# when handling an all-beam search case.
@
pytest
.
mark
.
parametrize
(
"seed"
,
RANDOM_SEEDS
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_sampler_min_tokens_penalty
(
seed
:
int
,
device
:
str
):
...
...
@@ -433,7 +414,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
sampling_metadata
=
SamplingMetadata
.
prepare
(
seq_group_metadata_list
,
seq_lens
=
seq_lens
if
seq_lens
else
None
,
query_lens
=
seq_lens
if
seq_lens
else
Non
e
,
query_lens
=
seq_lens
if
seq_lens
else
[
1
]
*
batch_siz
e
,
device
=
device
,
pin_memory
=
is_pin_memory_available
())
# the logits tensor is modified in-place by the sampler
...
...
@@ -478,7 +459,7 @@ def test_sampler_mixed(seed: int, device: str):
seq_lens
:
List
[
int
]
=
[]
for
i
in
range
(
batch_size
):
expected
:
Optional
[
List
[
int
]]
=
None
sampling_type
=
random
.
randint
(
0
,
3
)
sampling_type
=
random
.
randint
(
0
,
2
)
if
sampling_type
==
0
:
sampling_params
=
SamplingParams
(
temperature
=
0
)
expected
=
[
int
(
torch
.
argmax
(
fake_logits
[
i
],
dim
=-
1
).
item
())]
...
...
@@ -497,10 +478,7 @@ def test_sampler_mixed(seed: int, device: str):
for
idx
in
range
(
n
):
fake_logits
[
i
,
i
+
idx
]
=
1e2
expected
=
list
(
range
(
i
,
i
+
n
))
else
:
sampling_params
=
SamplingParams
(
temperature
=
0
,
use_beam_search
=
True
,
best_of
=
2
)
expected_tokens
.
append
(
expected
)
seq_group_metadata_list
.
append
(
SequenceGroupMetadata
(
...
...
@@ -529,9 +507,6 @@ def test_sampler_mixed(seed: int, device: str):
zip
(
sampler_output
,
seq_group_metadata_list
)):
assert
metadata
.
sampling_params
is
not
None
if
metadata
.
sampling_params
.
use_beam_search
:
continue
if
(
metadata
.
sampling_params
.
seed
is
not
None
and
expected_tokens
[
i
]
is
None
):
# Record seeded random result to compare with results of
...
...
@@ -596,8 +571,19 @@ def test_sampler_top_k_top_p(seed: int, device: str):
generation_config
=
GenerationConfig
(
top_k
=
top_k
,
top_p
=
top_p
,
do_sample
=
True
)
warpers
=
generation_model
.
_get_logits_warper
(
generation_config
,
device
)
assert
len
(
warpers
)
==
2
# top_p and top_k
@
dataclass
class
MockConfig
:
is_encoder_decoder
:
bool
=
False
generation_model
.
config
=
MockConfig
()
# needed by the following method
generation_model
.
_prepare_special_tokens
(
generation_config
,
device
=
device
)
processors
=
generation_model
.
_get_logits_processor
(
generation_config
,
None
,
None
,
None
,
[],
device
=
device
)
assert
len
(
processors
)
==
2
# top_p and top_k
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
]
=
[]
seq_lens
:
List
[
int
]
=
[]
...
...
@@ -639,7 +625,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
assert
sample_probs
is
not
None
hf_probs
=
warpe
rs
(
torch
.
zeros_like
(
fake_logits
),
fake_logits
.
clone
())
hf_probs
=
processo
rs
(
torch
.
zeros_like
(
fake_logits
),
fake_logits
.
clone
())
hf_probs
=
torch
.
softmax
(
hf_probs
,
dim
=-
1
,
dtype
=
torch
.
float
)
torch
.
testing
.
assert_close
(
hf_probs
,
sample_probs
,
rtol
=
0.0
,
atol
=
1e-5
)
assert
torch
.
equal
(
hf_probs
.
eq
(
0
),
sample_probs
.
eq
(
0
))
...
...
tests/spec_decode/e2e/test_compatibility.py
View file @
6d2051cc
...
...
@@ -5,16 +5,11 @@ from vllm import SamplingParams
from
.conftest
import
get_output_from_llm_generator
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
"JackFram/llama-68m"
,
"speculative_model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
"JackFram/llama-68m"
,
"speculative_model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
"enable_chunked_prefill"
:
True
,
...
...
@@ -44,16 +39,11 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator):
sampling_params
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
"meta-llama/Llama-2-7b-chat-hf"
,
"speculative_model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
"meta-llama/Llama-2-7b-chat-hf"
,
"speculative_model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
...
...
@@ -94,33 +84,3 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
with
pytest
.
raises
(
ValueError
,
match
=
"cannot be larger than"
):
get_output_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
"JackFram/llama-68m"
,
"speculative_model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
5
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_xfail_block_manager_v1
(
test_llm_generator
):
"""Verify that speculative decoding with block manager v1 fails.
"""
output_len
=
128
temperature
=
0.0
prompts
=
[
"Hello, my name is"
,
]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
)
with
pytest
.
raises
(
ValueError
,
match
=
"Speculative decoding requires usage of the V2"
):
get_output_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
tests/spec_decode/e2e/test_eagle_correctness.py
View file @
6d2051cc
...
...
@@ -43,9 +43,6 @@ PRECISION = "float32"
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -86,9 +83,6 @@ def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -143,9 +137,6 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
[{
"enforce_eager"
:
False
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -191,9 +182,6 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -235,9 +223,6 @@ def test_eagle_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -283,9 +268,6 @@ def test_eagle_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
tests/spec_decode/e2e/test_integration.py
View file @
6d2051cc
...
...
@@ -12,8 +12,6 @@ MAIN_MODEL = "JackFram/llama-68m"
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Verify equality when cuda graphs allowed.
"enforce_eager"
:
False
,
...
...
@@ -57,9 +55,6 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
...
...
@@ -102,3 +97,44 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model_name"
:
MAIN_MODEL
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"speculative_model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_disable_mqa_scorer"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_mqa_scorer
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify that ngram speculative decoding generates the same output
with batch expansion scorer and mqa scorer.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
tests/spec_decode/e2e/test_integration_dist_tp2.py
View file @
6d2051cc
...
...
@@ -17,9 +17,6 @@ from .conftest import run_equality_correctness_test_tp
[[
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
# Required for spec decode.
"--use-v2-block-manager"
,
"--tensor-parallel-size"
,
"2"
]])
...
...
@@ -74,9 +71,6 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
[[
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
# Required for spec decode.
"--use_v2_block_manager"
,
"--tensor_parallel_size"
,
"2"
,
...
...
tests/spec_decode/e2e/test_integration_dist_tp4.py
View file @
6d2051cc
...
...
@@ -19,9 +19,6 @@ SPEC_MODEL = "JackFram/llama-68m"
[[
# Skip cuda graph recording for fast test.
"--enforce_eager"
,
# Required for spec decode.
"--use-v2-block-manager"
,
"--tensor-parallel-size"
,
"4"
,
]])
...
...
@@ -71,9 +68,6 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
# Required for spec decode.
"--use-v2-block-manager"
,
"--tensor-parallel-size"
,
"4"
,
]])
...
...
tests/spec_decode/e2e/test_logprobs.py
View file @
6d2051cc
...
...
@@ -14,9 +14,6 @@ from .conftest import run_equality_correctness_test
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -67,9 +64,6 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -119,9 +113,6 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -173,9 +164,6 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -251,8 +239,6 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
"model_name"
:
"JackFram/llama-160m"
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_medusa_correctness.py
View file @
6d2051cc
...
...
@@ -45,9 +45,6 @@ PRECISION = "float32"
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -93,9 +90,6 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -151,9 +145,6 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
[{
"enforce_eager"
:
False
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -204,9 +195,6 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -253,9 +241,6 @@ def test_medusa_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -306,9 +291,6 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -350,6 +332,52 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
temperature
=
0.0
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
# Main model
"model_name"
:
MAIN_MODEL
,
"speculative_model"
:
SPEC_MODEL
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"speculative_disable_by_batch_size"
:
4
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_disable_mqa_scorer"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_mqa_scorer
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify that speculative decoding generates the same output
with batch expansion scorer and mqa scorer.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
if
__name__
==
"__main__"
:
import
pytest
pytest
.
main
([
__file__
])
tests/spec_decode/e2e/test_mlp_correctness.py
View file @
6d2051cc
...
...
@@ -47,9 +47,6 @@ PRECISION = "float32"
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -94,9 +91,6 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -149,9 +143,6 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -195,9 +186,6 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
...
...
@@ -258,9 +246,6 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -311,9 +296,6 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -366,9 +348,6 @@ def test_mlp_e2e_greedy_correctness_with_padding(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -419,9 +398,6 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
...
...
@@ -460,3 +436,43 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model_name"
:
MAIN_MODEL
,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"speculative_model"
:
SPEC_MODEL
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"speculative_disable_mqa_scorer"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
5
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_mqa_scorer
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
:
int
,
output_len
:
int
,
seed
:
int
):
"""Verify that speculative decoding generates the same output
with batch expansion scorer and mqa scorer.
"""
run_equality_correctness_test
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
tests/spec_decode/e2e/test_multistep_correctness.py
View file @
6d2051cc
...
...
@@ -55,9 +55,6 @@ from .conftest import (get_output_from_llm_generator,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
...
...
@@ -124,9 +121,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
}])
...
...
@@ -190,9 +184,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
}])
...
...
@@ -246,9 +237,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
[{
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
...
...
@@ -303,9 +291,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
}])
...
...
@@ -353,9 +338,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
}])
...
...
@@ -404,9 +386,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
...
...
@@ -454,9 +433,6 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
...
...
@@ -514,9 +490,6 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -570,9 +543,6 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -611,9 +581,6 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -660,9 +627,6 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_ngram_correctness.py
View file @
6d2051cc
...
...
@@ -35,9 +35,6 @@ from .conftest import run_equality_correctness_test
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
}])
...
...
@@ -82,9 +79,6 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# Print spec metrics.
"disable_log_stats"
:
False
,
}])
...
...
@@ -145,9 +139,6 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
...
...
@@ -195,9 +186,6 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -254,9 +242,6 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
@@ -292,3 +277,48 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
max_output_len
=
output_len
,
seed
=
seed
,
temperature
=
0.0
)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": "JackFram/llama-68m",

        # Eager mode avoids CUDA graph recording and keeps the test fast.
        "enforce_eager": True,

        # Speculative decoding settings shared by baseline and test runs.
        "speculative_model": "[ngram]",
        "num_speculative_tokens": 5,
        "ngram_prompt_lookup_max": 3,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "speculative_disable_mqa_scorer": True,
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # A short generation length keeps the test fast.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_ngram_scorer(vllm_runner, common_llm_kwargs,
                      per_test_common_llm_kwargs, baseline_llm_kwargs,
                      test_llm_kwargs, batch_size: int, output_len: int,
                      seed: int):
    """Check that ngram speculative decoding output is the same with the
    batch expansion scorer and with the MQA scorer.

    The test-side kwargs set ``speculative_disable_mqa_scorer`` while the
    baseline kwargs are empty, so the two runs differ only in that flag.
    Sampling is greedy (temperature 0.0).
    """
    # Generation settings forwarded by keyword to the shared helper.
    generation_settings = {
        "max_output_len": output_len,
        "seed": seed,
        "temperature": 0.0,
    }
    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, **generation_settings)
tests/spec_decode/e2e/test_seed.py
View file @
6d2051cc
...
...
@@ -17,9 +17,6 @@ SPEC_MODEL = "JackFram/llama-160m"
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
# Required for spec decode.
"use_v2_block_manager"
:
True
,
# speculative model
"speculative_model"
:
"JackFram/llama-160m"
,
...
...
tests/spec_decode/test_multi_step_worker.py
View file @
6d2051cc
...
...
@@ -173,7 +173,6 @@ def test_same_output_for_multi_step():
block_size
,
num_gpu_blocks
,
seed
,
model_runner_cls
=
TP1DraftModelRunner
,
)
worker
=
create_worker
(
...
...
@@ -673,7 +672,10 @@ def test_use_draft_model_runner_advance_step():
worker
.
model_runner
.
_gpu_advance_step
.
side_effect
=
ValueError
(
exception_secret
)
seq_group_metadata_list
,
_
,
_
=
create_batch
(
batch_size
,
k
)
seq_group_metadata_list
,
_
,
_
=
create_batch
(
batch_size
,
k
,
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
)
# Fallback (should not call) when num_steps=1.
execute_model_req
=
ExecuteModelRequest
(
...
...
tests/spec_decode/test_scorer.py
0 → 100644
View file @
6d2051cc
import
random
from
typing
import
List
import
pytest
import
torch
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.spec_decode.batch_expansion
import
BatchExpansionTop1Scorer
from
vllm.spec_decode.interfaces
import
SpeculativeProposals
,
SpeculativeScores
from
vllm.spec_decode.mqa_scorer
import
MQAScorer
from
vllm.worker.worker
import
Worker
from
.utils
import
create_batch
,
create_worker
def create_proposal(propose_lens: List[int], vocab_size: int,
                    device: str) -> SpeculativeProposals:
    """Build random speculative proposals for a batch.

    Args:
        propose_lens: Number of proposed tokens for each sequence.
        vocab_size: Size of the last dimension of the probability tensor.
        device: Torch device on which all tensors are created.

    Returns:
        A ``SpeculativeProposals`` built from the token-id tensor, the
        probability tensor and the proposal lengths as a tensor.
    """
    num_seqs = len(propose_lens)
    longest = max(propose_lens)

    # Uniform random values in [0, 1); they are not normalized over the vocab.
    proposal_probs = torch.rand((num_seqs, longest, vocab_size),
                                device=device)

    # Positions beyond a sequence's proposal length keep the -1 filler.
    proposal_token_ids = torch.full((num_seqs, longest),
                                    fill_value=-1,
                                    device=device)
    for row, length in enumerate(propose_lens):
        # The proposed token at each step is the argmax of that step's probs.
        proposal_token_ids[row][:length] = torch.argmax(
            proposal_probs[row][:length], dim=-1)

    lens_tensor = torch.tensor(propose_lens, device=device)
    return SpeculativeProposals(proposal_token_ids, proposal_probs,
                                lens_tensor)
def assert_score_equal(score1: SpeculativeScores,
                       score2: SpeculativeScores) -> None:
    """Assert that two scorer outputs agree.

    ``probs`` and ``logprobs`` are compared with ``torch.allclose``;
    ``token_ids`` must match exactly, and both tensors are shown in the
    failure message when they do not.
    """
    # Floating-point fields: compared within torch.allclose's default tolerance.
    probs_close = torch.allclose(score1.probs, score2.probs)
    assert probs_close

    logprobs_close = torch.allclose(score1.logprobs, score2.logprobs)
    assert logprobs_close

    # Token ids: exact comparison.
    assert torch.equal(
        score1.token_ids,
        score2.token_ids), f"{score1.token_ids}, {score2.token_ids}"
@pytest.mark.parametrize('model_name', ['facebook/opt-125m'])
@pytest.mark.parametrize('batch_size', [1, 2, 4, 8, 16])
@pytest.mark.parametrize('max_propose_len', [1, 3, 5])
@pytest.mark.parametrize('mixed_propose_len', [True])
@pytest.mark.parametrize('device', ['cuda'])
def test_scorer(model_name: str, batch_size: int, max_propose_len: int,
                mixed_propose_len: bool, device: str) -> None:
    """Check that the batch expansion scorer and the MQA scorer return the
    same scores for the same request and proposals.

    With ``mixed_propose_len`` a random number of sequences propose
    ``max_propose_len`` tokens and the rest propose none; otherwise every
    sequence proposes ``max_propose_len`` tokens. Only
    ``mixed_propose_len=True`` is currently parametrized.
    """
    seed = 0
    block_size = 32
    num_gpu_blocks = 2048 // block_size

    scorer_worker = create_worker(Worker, model_name, block_size,
                                  num_gpu_blocks, seed)

    # Set both sampler flags on the worker's model before scoring.
    sampler = scorer_worker.model_runner.model.sampler
    sampler.include_gpu_probs_tensor = True
    sampler.should_modify_greedy_probs_inplace = True

    vocab_size = scorer_worker.vocab_size

    if mixed_propose_len:
        # Random split between full-length proposals and empty ones.
        num_speculating = random.randint(0, batch_size)
        propose_lens = ([max_propose_len] * num_speculating +
                        [0] * (batch_size - num_speculating))
        random.shuffle(propose_lens)
    else:
        propose_lens = [max_propose_len] * batch_size

    proposals = create_proposal(propose_lens, vocab_size, device)
    seq_group_metadata_list, _, _ = create_batch(
        batch_size,
        max_propose_len,
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks)
    requests = ExecuteModelRequest(seq_group_metadata_list,
                                   num_lookahead_slots=max_propose_len)

    # Both scorers share the worker and score the same request and proposals.
    batch_expansion_scorer = BatchExpansionTop1Scorer(scorer_worker, device,
                                                      vocab_size)
    batch_expansion_score = batch_expansion_scorer.score_proposals(
        requests, proposals)

    mqa_scorer = MQAScorer(scorer_worker, device, vocab_size)
    mqa_score = mqa_scorer.score_proposals(requests, proposals)

    assert_score_equal(batch_expansion_score, mqa_score)
tests/spec_decode/test_spec_decode_worker.py
View file @
6d2051cc
...
...
@@ -63,10 +63,10 @@ def test_correctly_calls_draft_model(k: int, batch_size: int,
@
pytest
.
mark
.
parametrize
(
"acceptance_sampler_method"
,
[
"rejection_sampler"
,
"typical_acceptance_sampler"
])
@
torch
.
inference_mode
()
def
test_correctly_calls_target_model
(
k
:
int
,
batch_size
:
int
,
acceptance_sampler_method
:
str
):
def
test_
batch_expansion_
correctly_calls_target_model
(
k
:
int
,
batch_size
:
int
,
acceptance_sampler_method
:
str
):
"""Verify SpecDecodeWorker calls the target model with correct
inputs. Everything else is mocked out.
inputs
with batch expansion
. Everything else is mocked out.
"""
draft_worker
=
mock_worker
(
cls
=
MultiStepWorker
,
use_spec
=
False
)
target_worker
=
mock_worker
(
use_spec
=
False
)
...
...
@@ -82,7 +82,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int,
target_worker
,
mock_spec_decode_sampler
(
acceptance_sampler_method
),
disable_logprobs
=
False
,
metrics_collector
=
metrics_collector
)
metrics_collector
=
metrics_collector
,
disable_mqa_scorer
=
True
)
worker
.
init_device
()
vocab_size
=
32_000
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment