Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
af7f4372
Commit
af7f4372
authored
Sep 03, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1
parents
5e19cdef
09c77926
Changes
465
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1005 additions
and
68 deletions
+1005
-68
tests/multi_step/__init__.py
tests/multi_step/__init__.py
+0
-0
tests/multi_step/test_correctness.py
tests/multi_step/test_correctness.py
+85
-0
tests/multimodal/test_mapper.py
tests/multimodal/test_mapper.py
+77
-5
tests/plugins/vllm_add_dummy_model/setup.py
tests/plugins/vllm_add_dummy_model/setup.py
+9
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+26
-0
tests/prefix_caching/test_prefix_caching.py
tests/prefix_caching/test_prefix_caching.py
+7
-0
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+12
-8
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+68
-0
tests/quantization/test_experts_int8.py
tests/quantization/test_experts_int8.py
+28
-0
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+20
-7
tests/quantization/test_lm_head.py
tests/quantization/test_lm_head.py
+4
-2
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+7
-7
tests/samplers/test_sampler.py
tests/samplers/test_sampler.py
+82
-9
tests/samplers/test_typical_acceptance_sampler.py
tests/samplers/test_typical_acceptance_sampler.py
+9
-9
tests/spec_decode/e2e/conftest.py
tests/spec_decode/e2e/conftest.py
+22
-9
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+268
-0
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+48
-0
tests/spec_decode/e2e/test_logprobs.py
tests/spec_decode/e2e/test_logprobs.py
+75
-0
tests/spec_decode/e2e/test_medusa_correctness.py
tests/spec_decode/e2e/test_medusa_correctness.py
+56
-12
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
+102
-0
No files found.
Too many changes to show.
To preserve performance only
465 of 465+
files are displayed.
Plain diff
Email patch
tests/multi_step/__init__.py
0 → 100644
View file @
af7f4372
tests/multi_step/test_correctness.py
0 → 100644
View file @
af7f4372
# Test the AsyncLLMEngine with multi-step-decoding
from
typing
import
List
import
pytest
from
..utils
import
RemoteOpenAIServer
# Model names each test is parametrized over.
MODELS = ["JackFram/llama-160m"]
# Multi-step decoding steps
NUM_SCHEDULER_STEPS = [8]
# Number of prompts sent in a single completions request.
NUM_PROMPTS = [10]

# CLI flags shared by the reference server and the multi-step server.
DEFAULT_SERVER_ARGS: List[str] = [
    "--disable-log-requests",
    "--use-v2-block-manager",
    "--worker-use-ray",
    "--gpu-memory-utilization",
    "0.85",
    "--swap-space",
    "16",
]
async def completions_with_server_args(prompts: List[str], model_name: str,
                                       server_cli_args: List[str]):
    """Start a server with the given CLI args and request completions.

    A ``RemoteOpenAIServer`` is opened for ``model_name`` with
    ``server_cli_args``. One non-streaming completions request carrying all
    ``prompts`` is sent through the server's async client with
    ``temperature=0`` and ``max_tokens=5``. The response object is returned
    once the server context has been exited.
    """
    request_kwargs = dict(
        model=model_name,
        prompt=prompts,
        temperature=0,
        stream=False,
        max_tokens=5,
    )

    completion = None
    with RemoteOpenAIServer(model_name, server_cli_args) as remote_server:
        async_client = remote_server.get_async_client()
        completion = await async_client.completions.create(**request_kwargs)

    # Stays None only if the context manager swallowed an exception.
    assert completion is not None
    return completion
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size, pp_size", [
    (1, 1),
    (2, 2),
])
@pytest.mark.parametrize("eager_mode", [False, True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.asyncio
async def test_multi_step(example_prompts, model: str, tp_size: int,
                          pp_size: int, eager_mode: bool,
                          num_scheduler_steps: int, num_prompts: int):
    """Compare completions from a multi-step server with a reference server.

    The reference server is launched with ``DEFAULT_SERVER_ARGS`` plus
    ``--enforce-eager``. The server under test is launched with
    ``DEFAULT_SERVER_ARGS`` plus ``--num-scheduler-steps`` and, only when
    ``eager_mode`` is true, ``--enforce-eager``. Both receive the same
    tensor/pipeline parallel sizes and the same prompts; the generated texts
    must be identical.
    """
    prompts = example_prompts
    if len(prompts) < num_prompts:
        # Fail with a clear message instead of a ZeroDivisionError below.
        assert prompts, "example_prompts must not be empty"
        # Repeat the prompt list until it is long enough, then trim.
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts

    server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
    ms_server_args = DEFAULT_SERVER_ARGS + \
        ["--num-scheduler-steps", f"{num_scheduler_steps}"]

    if eager_mode:
        ms_server_args.append("--enforce-eager")

    distributed_args = [
        "--tensor-parallel-size",
        str(tp_size),
        "--pipeline-parallel-size",
        str(pp_size),
    ]

    # The two servers run one after the other, never concurrently.
    ref_completions = await completions_with_server_args(
        prompts, model, server_args + distributed_args)
    test_completions = await completions_with_server_args(
        prompts, model, ms_server_args + distributed_args)

    def get_text_generations(completions):
        """Extract the generated text of every choice in a response."""
        return [x.text for x in completions.choices]

    ref_generations = get_text_generations(ref_completions)
    test_generations = get_text_generations(test_completions)
    assert ref_generations == test_generations
tests/multimodal/test_mapper.py
View file @
af7f4372
from
contextlib
import
nullcontext
import
numpy
as
np
import
pytest
from
transformers
import
CLIPImageProcessor
,
LlavaNextImageProcessor
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
M
ULTIMODAL_REGISTRY
from
vllm.multimodal
import
M
ultiModalRegistry
from
vllm.multimodal.utils
import
rescale_image_size
@pytest.fixture
def mm_registry():
    """Return a newly constructed ``MultiModalRegistry``."""
    registry = MultiModalRegistry()
    return registry
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"float"
])
@
pytest
.
mark
.
parametrize
(
"size_factor"
,
[
0.25
,
0.5
,
1.0
])
def
test_clip_image_processor
(
image_assets
,
dtype
,
size_factor
):
def
test_clip_image_processor
(
image_assets
,
mm_registry
,
dtype
,
size_factor
):
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
hf_processor
=
CLIPImageProcessor
.
from_pretrained
(
MODEL_NAME
)
...
...
@@ -23,8 +30,11 @@ def test_clip_image_processor(image_assets, dtype, size_factor):
seed
=
0
,
dtype
=
dtype
,
revision
=
None
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
mm_registry
.
init_mm_limits_per_prompt
(
model_config
)
for
asset
in
image_assets
:
image
=
rescale_image_size
(
asset
.
pil_image
,
size_factor
)
...
...
@@ -32,7 +42,7 @@ def test_clip_image_processor(image_assets, dtype, size_factor):
image
,
return_tensors
=
"pt"
,
)
vllm_result
=
MULTIMODAL_REGISTRY
.
map_input
(
vllm_result
=
mm_registry
.
map_input
(
model_config
,
{
"image"
:
image
},
)
...
...
@@ -48,7 +58,8 @@ def test_clip_image_processor(image_assets, dtype, size_factor):
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"float"
])
@
pytest
.
mark
.
parametrize
(
"size_factor"
,
[
0.25
,
0.5
,
1.0
])
def
test_llava_next_image_processor
(
image_assets
,
dtype
,
size_factor
):
def
test_llava_next_image_processor
(
image_assets
,
mm_registry
,
dtype
,
size_factor
):
MODEL_NAME
=
"llava-hf/llava-v1.6-vicuna-7b-hf"
hf_processor
=
LlavaNextImageProcessor
.
from_pretrained
(
MODEL_NAME
)
...
...
@@ -62,8 +73,11 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):
seed
=
0
,
dtype
=
dtype
,
revision
=
None
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
mm_registry
.
init_mm_limits_per_prompt
(
model_config
)
for
asset
in
image_assets
:
image
=
rescale_image_size
(
asset
.
pil_image
,
size_factor
)
...
...
@@ -71,7 +85,7 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):
image
,
return_tensors
=
"pt"
,
)
vllm_result
=
MULTIMODAL_REGISTRY
.
map_input
(
vllm_result
=
mm_registry
.
map_input
(
model_config
,
{
"image"
:
image
},
)
...
...
@@ -83,3 +97,61 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):
assert
hf_arr
.
shape
==
vllm_arr
.
shape
,
f
"Failed for key=
{
key
}
"
assert
np
.
allclose
(
hf_arr
,
vllm_arr
),
f
"Failed for key=
{
key
}
"
@pytest.mark.parametrize(
    ("num_images", "limit", "is_valid"),
    [
        (0, 0, True),
        (0, 1, True),
        (1, 0, False),
        (1, 1, True),
        (1, 2, True),
        (2, 1, False),
        (2, 2, True),
    ],
)
def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
    """Check that ``map_input`` enforces the per-prompt image limit.

    ``num_images`` copies of the first image asset are mapped against a model
    config whose ``limit_mm_per_prompt`` allows ``limit`` images. A
    ``ValueError`` is expected exactly when ``is_valid`` is false.
    """
    model_id = "llava-hf/llava-1.5-7b-hf"

    model_config = ModelConfig(
        model=model_id,
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="half",
        revision=None,
        limit_mm_per_prompt={"image": limit},
    )
    mm_registry.init_mm_limits_per_prompt(model_config)

    image = image_assets[0].pil_image
    # No images -> empty mapping; one image -> bare image; several -> a list.
    if num_images == 0:
        mm_inputs = {}
    elif num_images == 1:
        mm_inputs = {"image": image}
    else:
        mm_inputs = {"image": [image] * num_images}

    if is_valid:
        expectation = nullcontext()
    else:
        expectation = pytest.raises(ValueError)

    with expectation:
        mm_registry.map_input(model_config, mm_inputs)
# NOTE: We don't test zero images since the HF processor doesn't support it
@pytest.mark.parametrize("num_images", [1, 2])
def test_image_mapper_multi(image_assets, mm_registry, num_images):
    """Map a list of ``num_images`` images and check the output length.

    The limit in ``limit_mm_per_prompt`` is set to ``num_images``, and the
    mapped ``"pixel_values"`` entry must contain ``num_images`` items.
    """
    model_id = "llava-hf/llava-1.5-7b-hf"

    model_config = ModelConfig(
        model=model_id,
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="half",
        revision=None,
        limit_mm_per_prompt={"image": num_images},
    )
    mm_registry.init_mm_limits_per_prompt(model_config)

    first_image = image_assets[0].pil_image
    repeated_images = [first_image] * num_images

    mapped_inputs = mm_registry.map_input(model_config,
                                          {"image": repeated_images})
    pixel_values = mapped_inputs["pixel_values"]
    assert len(pixel_values) == num_images
tests/plugins/vllm_add_dummy_model/setup.py
0 → 100644
View file @
af7f4372
from
setuptools
import
setup
# Package the dummy-model plugin and expose its ``register`` function under
# the 'vllm.general_plugins' entry-point group.
setup(
    name='vllm_add_dummy_model',
    version='0.1',
    packages=['vllm_add_dummy_model'],
    entry_points={
        'vllm.general_plugins':
        ["register_dummy_model = vllm_add_dummy_model:register"],
    },
)
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
0 → 100644
View file @
af7f4372
from
typing
import
Optional
import
torch
from
vllm
import
ModelRegistry
from
vllm.model_executor.models.opt
import
OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
class MyOPTForCausalLM(OPTForCausalLM):
    """OPT variant whose logits are overwritten so column 0 dominates."""

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Compute logits via the parent class, then overwrite them in place.

        Returns ``None`` when the parent returns ``None``. Otherwise every
        logit is zeroed and 1.0 is added to column 0 before returning.
        """
        # this dummy model always predicts the first token
        logits = super().compute_logits(hidden_states, sampling_metadata)
        if logits is None:
            return None

        logits.zero_()
        logits[:, 0] += 1.0
        return logits
def register():
    """Register ``MyOPTForCausalLM`` with ``ModelRegistry`` if it is absent."""
    arch_name = "MyOPTForCausalLM"
    # Skip when the architecture is already listed as supported.
    if arch_name in ModelRegistry.get_supported_archs():
        return
    # register our dummy model
    ModelRegistry.register_model(arch_name, MyOPTForCausalLM)
tests/prefix_caching/test_prefix_caching.py
View file @
af7f4372
...
...
@@ -34,6 +34,9 @@ def test_block_allocator(
assert
(
first_block
==
second_block
)
assert
(
second_block
.
ref_count
==
2
)
# Check metric: 1 hit of 2 queries
assert
block_allocator
.
get_prefix_cache_hit_rate
()
==
0.5
# Free the first_block and confirm that the ref_count is correctly
# decremented on the second block
block_allocator
.
free
(
first_block
)
...
...
@@ -48,6 +51,10 @@ def test_block_allocator(
assert
(
first_block
==
second_block
)
assert
(
first_block
.
block_hash
==
block_hash
)
# Allocate one more time to get 3/4 hit rate for easy checking
block_allocator
.
allocate
(
block_hash
,
0
)
assert
block_allocator
.
get_prefix_cache_hit_rate
()
==
0.75
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
16
])
def
test_eviction
(
num_blocks
:
int
,
):
...
...
tests/quantization/test_compressed_tensors.py
View file @
af7f4372
...
...
@@ -9,7 +9,7 @@ import torch
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsLinearMethod
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A8Int8
,
CompressedTensorsWNA16
)
CompressedTensorsW8A16Fp8
,
CompressedTensorsWNA16
)
from
vllm.model_executor.layers.quantization.compressed_tensors.utils
import
(
QuantizationType
)
...
...
@@ -109,7 +109,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
assert
qkv_proj
.
weight_packed
.
dtype
is
torch
.
int32
assert
qkv_proj
.
weight_scale
.
dtype
is
torch
.
float16
assert
qkv_proj
.
weight_packed
.
pack_factor
==
pack_factor
assert
qkv_proj
.
scheme
.
pack_factor
==
pack_factor
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
assert
output
...
...
@@ -140,12 +140,16 @@ def test_compressed_tensors_fp8(vllm_runner):
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8Fp8
)
assert
qkv_proj
.
weight
.
dtype
is
torch
.
float8_e4m3fn
assert
isinstance
(
qkv_proj
.
scheme
,
(
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A16Fp8
))
assert
qkv_proj
.
input_scale
.
dtype
is
torch
.
float32
assert
qkv_proj
.
weight_scale
.
dtype
is
torch
.
float32
# should be scalars after processing
if
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8Fp8
):
assert
len
(
qkv_proj
.
input_scale
.
shape
)
==
0
assert
qkv_proj
.
weight
.
dtype
is
torch
.
float8_e4m3fn
assert
qkv_proj
.
weight_scale
.
dtype
is
torch
.
float32
assert
len
(
qkv_proj
.
weight_scale
.
shape
)
==
0
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
...
...
tests/quantization/test_cpu_offload.py
0 → 100644
View file @
af7f4372
# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
compare_two_settings
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
    """Run ``compare_two_settings`` for fp8 with and without CPU offload.

    Each checkpoint is compared between its base arguments and the same
    arguments with ``--cpu-offload-gb 2`` appended.
    """
    cases = (
        # Test quantization of an unquantized checkpoint
        ("meta-llama/Meta-Llama-3-8B-Instruct", ["--quantization", "fp8"]),
        # Test loading a quantized checkpoint
        ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", []),
    )
    for checkpoint, base_args in cases:
        offload_args = base_args + ["--cpu-offload-gb", "2"]
        compare_two_settings(checkpoint,
                             base_args,
                             offload_args,
                             max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
    """Run ``compare_two_settings`` for GPTQ with and without CPU offload.

    The same checkpoint is compared twice: once with no extra arguments and
    once with ``--quantization gptq``, each against the same arguments with
    ``--cpu-offload-gb 1`` appended.
    """
    checkpoint = "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
    base_arg_sets = (
        # Test GPTQ Marlin
        [],
        # Test GPTQ
        ["--quantization", "gptq"],
    )
    for base_args in base_arg_sets:
        offload_args = base_args + ["--cpu-offload-gb", "1"]
        compare_two_settings(checkpoint,
                             base_args,
                             offload_args,
                             max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                    reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
    """Run ``compare_two_settings`` for AWQ with and without CPU offload.

    The same checkpoint is compared twice: once with no extra arguments and
    once with ``--quantization awq``, each against the same arguments with
    ``--cpu-offload-gb 1`` appended.
    """
    checkpoint = "Qwen/Qwen2-1.5B-Instruct-AWQ"
    base_arg_sets = (
        # Test AWQ Marlin
        [],
        # Test AWQ
        ["--quantization", "awq"],
    )
    for base_args in base_arg_sets:
        offload_args = base_args + ["--cpu-offload-gb", "1"]
        compare_two_settings(checkpoint,
                             base_args,
                             offload_args,
                             max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
    """Run ``compare_two_settings`` for compressed-tensors checkpoints.

    Each checkpoint is compared between no extra arguments and
    ``--cpu-offload-gb 1``.
    """
    checkpoints = (
        # Test wNa16
        "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
        # Test w4a16_marlin24
        "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
        # Test w8a8
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    )
    for checkpoint in checkpoints:
        compare_two_settings(checkpoint, [], ["--cpu-offload-gb", "1"],
                             max_wait_seconds=480)
tests/quantization/test_experts_int8.py
0 → 100644
View file @
af7f4372
# flake8: noqa
"""Tests experts_int8 quantization startup and generation,
doesn't test correctness
"""
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
# Model names the experts_int8 startup test is parametrized over.
MODELS = [
    "ai21labs/Jamba-tiny-random",
]
@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
                    reason="ExpertsInt8 is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_model_experts_int8_startup(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Load ``model`` with experts_int8 quantization and generate greedily.

    Nothing is asserted about the generated output; the test passes when
    loading and generation complete without raising.
    """
    runner_cm = vllm_runner(model, dtype=dtype, quantization="experts_int8")
    with runner_cm as quantized_model:
        quantized_model.generate_greedy(example_prompts, max_tokens)
tests/quantization/test_fp8.py
View file @
af7f4372
...
...
@@ -9,6 +9,7 @@ from tests.quantization.utils import is_quant_method_supported
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.quantization.fp8
import
(
Fp8KVCacheMethod
,
Fp8LinearMethod
)
from
vllm.platforms
import
current_platform
MODELS
=
[
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
,
...
...
@@ -20,7 +21,12 @@ MODELS = [
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
MODELS
)
def
test_model_load_and_run
(
vllm_runner
,
model_id
:
str
):
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
def
test_model_load_and_run
(
vllm_runner
,
model_id
:
str
,
force_marlin
:
bool
,
monkeypatch
)
->
None
:
if
force_marlin
:
monkeypatch
.
setenv
(
"VLLM_TEST_FORCE_FP8_MARLIN"
,
"1"
)
with
vllm_runner
(
model_id
)
as
llm
:
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
...
...
@@ -61,7 +67,12 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
def
test_load_fp16_model
(
vllm_runner
,
kv_cache_dtype
:
str
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
def
test_load_fp16_model
(
vllm_runner
,
kv_cache_dtype
:
str
,
force_marlin
:
bool
,
monkeypatch
)
->
None
:
if
force_marlin
:
monkeypatch
.
setenv
(
"VLLM_TEST_FORCE_FP8_MARLIN"
,
"1"
)
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
,
kv_cache_dtype
=
kv_cache_dtype
)
as
llm
:
...
...
@@ -75,9 +86,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None:
assert
attn
.
_k_scale
==
1.0
assert
attn
.
_v_scale
==
1.0
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
current_platform
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
if
capability
>=
89
:
if
capability
>=
89
and
not
force_marlin
:
# For GPUs with hardware support, we keep weights in fp8
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
else
:
...
...
@@ -116,16 +127,18 @@ def test_scaled_fp8_quant(dtype) -> None:
# Reference dynamic quantization
y
=
quantize_ref
(
x
,
inv_scale
)
assert
torch
.
allclose
(
ref_y
,
per_tensor_dequantize
(
y
,
inv_scale
,
dtype
))
torch
.
testing
.
assert_close
(
ref_y
,
per_tensor_dequantize
(
y
,
inv_scale
,
dtype
))
# Static quantization
y
,
_
=
ops
.
scaled_fp8_quant
(
x
,
inv_scale
)
assert
torch
.
allclose
(
ref_y
,
per_tensor_dequantize
(
y
,
inv_scale
,
dtype
))
torch
.
testing
.
assert_close
(
ref_y
,
per_tensor_dequantize
(
y
,
inv_scale
,
dtype
))
# Padding
y
,
_
=
ops
.
scaled_fp8_quant
(
x
,
inv_scale
,
num_token_padding
=
17
)
assert
y
.
shape
[
0
]
==
17
assert
torch
.
all
close
(
torch
.
testing
.
assert_
close
(
ref_y
,
per_tensor_dequantize
(
torch
.
narrow
(
y
,
0
,
0
,
x
.
shape
[
0
]),
inv_scale
,
dtype
))
tests/quantization/test_lm_head.py
View file @
af7f4372
...
...
@@ -7,11 +7,12 @@ from typing import Tuple
import
pytest
import
torch
from
vllm.model_executor.layers.linear
import
UnquantizedLinearMethod
from
vllm.model_executor.layers.quantization.gptq
import
GPTQLinearMethod
from
vllm.model_executor.layers.quantization.gptq_marlin
import
(
GPTQMarlinLinearMethod
)
from
vllm.model_executor.layers.quantization.marlin
import
MarlinLinearMethod
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
UnquantizedEmbeddingMethod
)
PROMPT
=
"On the surface of Mars, we found"
...
...
@@ -37,7 +38,8 @@ def test_lm_head(
lm_head_layer
.
linear_method
,
(
GPTQLinearMethod
,
GPTQMarlinLinearMethod
,
MarlinLinearMethod
))
else
:
assert
isinstance
(
lm_head_layer
.
linear_method
,
UnquantizedLinearMethod
)
assert
isinstance
(
lm_head_layer
.
linear_method
,
UnquantizedEmbeddingMethod
)
print
(
vllm_model
.
generate_greedy
(
prompts
=
[
"Hello my name is"
],
...
...
tests/samplers/test_rejection_sampler.py
View file @
af7f4372
...
...
@@ -25,7 +25,7 @@ def mock_causal_accepted_tensor(
accepted
=
(
torch
.
arange
(
k
).
expand
(
batch_size
,
k
)
<=
last_accepted_indices
.
unsqueeze
(
-
1
).
broadcast_to
(
batch_size
,
k
))
.
to
(
device
=
"cuda"
)
batch_size
,
k
))
# Sprinkle accepted values after the contiguous initial accepted values.
# This replicates the behavior of rejection sampling, which may "accept"
...
...
@@ -33,7 +33,7 @@ def mock_causal_accepted_tensor(
sprinkle_candidates
=
(
torch
.
arange
(
k
).
expand
(
batch_size
,
k
)
>
last_accepted_indices
.
unsqueeze
(
-
1
).
broadcast_to
(
batch_size
,
k
)
+
1
)
sprinkle
=
torch
.
rand
(
batch_size
,
k
,
device
=
"cuda"
)
>
0.5
sprinkle
=
torch
.
rand
(
batch_size
,
k
)
>
0.5
accepted
[
sprinkle_candidates
]
=
sprinkle
[
sprinkle_candidates
]
return
accepted
...
...
@@ -86,7 +86,7 @@ def test_correct_output_format(which_tokens_accepted: str,
rejection_sampler
=
RejectionSampler
(
disable_bonus_tokens
=
disable_bonus_tokens
)
rejection_sampler
.
init_gpu_tensors
(
rank
=
0
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
output_token_ids
=
rejection_sampler
.
_create_output
(
# pylint: disable=protected-access
accepted
,
recovered_token_ids
,
...
...
@@ -138,7 +138,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
device
:
str
):
torch
.
set_default_device
(
device
)
rejection_sampler
=
RejectionSampler
()
rejection_sampler
.
init_gpu_tensors
(
rank
=
0
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
...
...
@@ -167,7 +167,7 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
device
:
str
):
torch
.
set_default_device
(
device
)
rejection_sampler
=
RejectionSampler
()
rejection_sampler
.
init_gpu_tensors
(
rank
=
0
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
...
...
@@ -211,7 +211,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
torch
.
set_default_device
(
device
)
rejection_sampler
=
RejectionSampler
(
strict_mode
=
True
)
rejection_sampler
.
init_gpu_tensors
(
rank
=
0
)
rejection_sampler
.
init_gpu_tensors
(
device
=
device
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
...
...
@@ -339,7 +339,7 @@ class _CorrectnessTestHelper:
self
.
vocab_size
=
vocab_size
self
.
vocab_range
=
(
0
,
vocab_size
)
self
.
rejection_sampler
.
init_gpu_tensors
(
rank
=
0
)
self
.
rejection_sampler
.
init_gpu_tensors
(
device
=
0
)
# Keep test simple, use k=1
self
.
k
=
1
...
...
tests/samplers/test_sampler.py
View file @
af7f4372
import
itertools
import
random
from
array
import
array
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
unittest.mock
import
patch
from
unittest.mock
import
Mock
,
patch
import
pytest
import
torch
from
transformers
import
GenerationConfig
,
GenerationMixin
import
vllm.envs
as
envs
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
)
from
vllm.utils
import
Counter
,
is_pin_memory_available
...
...
@@ -56,7 +59,9 @@ def _do_sample(
SequenceGroupMetadata
(
request_id
=
f
"test_
{
i
}
"
,
is_prompt
=
True
,
seq_data
=
{
0
:
SequenceData
([
1
,
2
,
3
])},
seq_data
=
{
0
:
SequenceData
(
array
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
[
1
,
2
,
3
]))
},
sampling_params
=
sampling_params
,
block_tables
=
{
0
:
[
1
]},
))
...
...
@@ -201,7 +206,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
def
create_sequence_data
(
num_input
=
3
,
num_generated
=
0
):
seq_data
=
SequenceData
(
random
.
choices
(
range
(
0
,
VOCAB_SIZE
),
k
=
num_input
))
array
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
random
.
choices
(
range
(
0
,
VOCAB_SIZE
),
k
=
num_input
)))
if
num_generated
>
0
:
seq_data
.
output_token_ids
=
random
.
choices
(
range
(
0
,
VOCAB_SIZE
),
k
=
num_generated
)
...
...
@@ -504,7 +510,9 @@ def test_sampler_mixed(seed: int, device: str):
SequenceGroupMetadata
(
request_id
=
f
"test_
{
i
}
"
,
is_prompt
=
True
,
seq_data
=
{
0
:
SequenceData
([
1
,
2
,
3
])},
seq_data
=
{
0
:
SequenceData
(
array
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
[
1
,
2
,
3
]))
},
sampling_params
=
sampling_params
,
block_tables
=
{
0
:
[
1
]},
))
...
...
@@ -600,7 +608,9 @@ def test_sampler_top_k_top_p(seed: int, device: str):
SequenceGroupMetadata
(
request_id
=
f
"test_
{
i
}
"
,
is_prompt
=
True
,
seq_data
=
{
0
:
SequenceData
([
1
,
2
,
3
])},
seq_data
=
{
0
:
SequenceData
(
array
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
[
1
,
2
,
3
]))
},
sampling_params
=
SamplingParams
(
temperature
=
1
,
top_k
=
top_k
,
...
...
@@ -625,17 +635,51 @@ def test_sampler_top_k_top_p(seed: int, device: str):
return
([[
prob
.
topk
(
1
,
dim
=-
1
).
indices
.
tolist
(),
[
0
]]
for
prob
in
probs
],
None
)
with
patch
(
"vllm.model_executor.layers.sampler._sample"
,
mock_sample
):
# top-k and top-p are only calculated when the flashinfer kernel is not available
with
patch
(
"vllm.model_executor.layers.sampler._sample"
,
mock_sample
),
\
patch
(
"vllm.model_executor.layers.sampler."
"flashinfer_top_k_top_p_sampling"
,
None
):
sampler
(
logits
=
fake_logits
,
sampling_metadata
=
sampling_metadata
)
assert
sample_probs
is
not
None
hf_probs
=
warpers
(
torch
.
zeros_like
(
fake_logits
),
fake_logits
.
clone
())
hf_probs
=
torch
.
softmax
(
hf_probs
,
dim
=-
1
,
dtype
=
torch
.
float
)
assert
torch
.
all
close
(
hf_probs
,
sample_probs
,
atol
=
1e-5
)
torch
.
testing
.
assert_
close
(
hf_probs
,
sample_probs
,
rtol
=
0.0
,
atol
=
1e-5
)
assert
torch
.
equal
(
hf_probs
.
eq
(
0
),
sample_probs
.
eq
(
0
))
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_flashinfer_fallback(seed: int, device: str):
    """Compare sampling output with and without a stubbed flashinfer kernel.

    The test is skipped when ``envs.VLLM_USE_FLASHINFER_SAMPLER`` is falsy.
    Otherwise it samples once normally and once with
    ``flashinfer_top_k_top_p_sampling`` patched by a stub that returns
    ``None`` plus an all-zero int32 tensor, and asserts both outputs are
    equal.
    """
    if not envs.VLLM_USE_FLASHINFER_SAMPLER:
        pytest.skip("Flashinfer sampler is disabled")

    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
    _, fake_logits, sampler = _prepare_test(batch_size)

    def failing_flashinfer_sampling(*_args, **_kwargs):
        """Stub kernel: no samples, plus a zero tensor of length batch_size."""
        zeros = torch.zeros(batch_size, device=device, dtype=torch.int32)
        return None, zeros

    # Draw n before the request seed to keep the random sequence unchanged.
    num_samples = random.randint(1, 10)
    request_seed = random.randint(0, 10000)
    sampling_params = SamplingParams(
        temperature=1.0,
        n=num_samples,
        seed=request_seed,
    )
    sample_args = (batch_size, fake_logits, sampler, sampling_params, device)

    sampler_output = _do_sample(*sample_args)

    patch_target = ("vllm.model_executor.layers.sampler."
                    "flashinfer_top_k_top_p_sampling")
    with patch(patch_target, failing_flashinfer_sampling):
        fallback_sampler_output = _do_sample(*sample_args)

    assert sampler_output == fallback_sampler_output
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_sampler_repetition_penalty_mixed
(
device
:
str
):
...
...
@@ -650,7 +694,11 @@ def test_sampler_repetition_penalty_mixed(device: str):
SequenceGroupMetadata
(
request_id
=
f
"test_
{
i
}
"
,
is_prompt
=
True
,
seq_data
=
{
0
:
SequenceData
([
1
,
2
,
3
])},
seq_data
=
{
0
:
SequenceData
(
array
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
[
1
,
2
,
3
]))
},
sampling_params
=
sampling_params
[
i
],
block_tables
=
{
0
:
[
1
]},
))
...
...
@@ -703,3 +751,28 @@ def test_sampler_repetition_penalty_mixed(device: str):
assert
tokens1
[
0
]
==
tokens2
[
1
]
assert
tokens1
[
1
]
==
tokens2
[
0
]
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_include_gpu_probs_tensor(device: str):
    """Check sampler output when GPU probability tensors are requested.

    With ``include_gpu_probs_tensor`` set and
    ``should_modify_greedy_probs_inplace`` cleared, sampling at temperature 0
    must not call ``_modify_greedy_probs_inplace``. The output must carry
    non-``None`` ``sampled_token_probs``, ``logprobs`` and
    ``sampled_token_ids``.
    """
    set_random_seed(42)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
    _, fake_logits, sampler = _prepare_test(batch_size)
    sampler.include_gpu_probs_tensor = True
    sampler.should_modify_greedy_probs_inplace = False

    greedy_params = SamplingParams(temperature=0)

    inplace_spy = Mock()
    patch_target = (
        "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace")
    with patch(patch_target, inplace_spy):
        sampler_output = _do_sample(batch_size, fake_logits, sampler,
                                    greedy_params, device)
        inplace_spy.assert_not_called()

    for field in ("sampled_token_probs", "logprobs", "sampled_token_ids"):
        assert getattr(sampler_output, field) is not None
tests/samplers/test_typical_acceptance_sampler.py
View file @
af7f4372
...
...
@@ -78,7 +78,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
"""
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
()
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
...
...
@@ -111,7 +111,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
vocab_size
=
30_000
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
)
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
bonus_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
...
...
@@ -171,7 +171,7 @@ def test_uniform_target_distribution_accepts_all_tokens(
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
,
disable_bonus_tokens
=
disable_bonus_tokens
)
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
draft_token_ids
=
torch
.
randint
(
low
=
0
,
high
=
vocab_size
,
...
...
@@ -225,7 +225,7 @@ def test_temperature_zero_target_distribution(seed: int,
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
,
disable_bonus_tokens
=
disable_bonus_tokens
)
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
# Simulate temperature 0 probability distribution for target probabilities
# and create target probabilities such that only 1 token id has
# probability 1.0
...
...
@@ -285,7 +285,7 @@ def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool,
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
,
disable_bonus_tokens
=
disable_bonus_tokens
)
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
# For sequences 0 and 2 set the distribution to a temperature
# zero distribution. For sequences 1 and 3 set it to a uniform
# distribution.
...
...
@@ -352,7 +352,7 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
,
disable_bonus_tokens
=
disable_bonus_tokens
)
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
# Create a temperature zero target probability distribution and ensure
# all draft token ids correspond to the tokens with 1.0 probability.
# Verify that all of them are accepted.
...
...
@@ -414,7 +414,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
,
disable_bonus_tokens
=
disable_bonus_tokens
)
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
# Simulate temperature 0 probability distribution for target
# probabilities and create target probabilities such that only 1 token
# id has probability 1.0 and others have a very low probability of
...
...
@@ -447,7 +447,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
disable_bonus_tokens
=
disable_bonus_tokens
,
posterior_threshold
=
0.0
,
posterior_alpha
=
0.0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
output_token_ids
=
typical_acceptance_sampler
(
target_probs
,
bonus_token_ids
,
...
...
@@ -485,7 +485,7 @@ def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool,
torch
.
set_default_device
(
device
)
typical_acceptance_sampler
=
get_acceptance_sampler
(
strict_mode
=
True
,
disable_bonus_tokens
=
disable_bonus_tokens
)
typical_acceptance_sampler
.
init_gpu_tensors
(
rank
=
0
)
typical_acceptance_sampler
.
init_gpu_tensors
(
device
=
device
)
target_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
expected_replacement_tokens
=
-
torch
.
ones
(
(
batch_size
,
k
),
dtype
=
torch
.
long
)
...
...
tests/spec_decode/e2e/conftest.py
View file @
af7f4372
import
asyncio
import
os
from
itertools
import
cycle
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Union
...
...
@@ -56,6 +57,11 @@ class AsyncLLM:
)
->
None
:
if
"disable_log_stats"
not
in
kwargs
:
kwargs
[
"disable_log_stats"
]
=
True
# Needed so that engine_use_ray still works as a deprecated feature;
# otherwise the following constructor will raise an exception.
os
.
environ
[
"VLLM_ALLOW_ENGINE_USE_RAY"
]
=
"1"
engine_args
=
AsyncEngineArgs
(
model
=
model
,
tokenizer
=
tokenizer
,
...
...
@@ -282,7 +288,8 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
ensure_all_accepted
=
ensure_all_accepted
)
def
run_equality_correctness_test
(
baseline_llm_generator
,
def
run_equality_correctness_test
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
max_output_len
,
...
...
@@ -290,7 +297,8 @@ def run_equality_correctness_test(baseline_llm_generator,
temperature
:
float
,
seeded
:
bool
,
print_tokens
:
bool
=
False
,
ensure_all_accepted
:
bool
=
False
):
ensure_all_accepted
:
bool
=
False
,
expected_acceptance_rate
:
Optional
[
float
]
=
None
):
"""Helper method that compares the outputs of both the baseline LLM and
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the same when temperature is zero (or when temperature is > 0 and seeded).
...
...
@@ -351,5 +359,10 @@ def run_equality_correctness_test(baseline_llm_generator,
print
(
f
'
{
i
=
}
{
spec_token_ids
=
}
'
)
assert
baseline_token_ids
==
spec_token_ids
print
(
f
'
{
acceptance_rate
=
}
'
)
if
ensure_all_accepted
:
assert
acceptance_rate
==
1.0
if
expected_acceptance_rate
is
not
None
:
assert
acceptance_rate
>=
expected_acceptance_rate
-
1e-2
tests/spec_decode/e2e/test_eagle_correctness.py
0 → 100644
View file @
af7f4372
"""This docstring details important information on the testing methodology.
Most of the tests rely on "greedy equality", where we expect the output of
speculative decoding on a sequence to exactly match the output of normal non-
speculative decoding.
Since speculative decoding with rejection sampling guarantees that the output
distribution matches the target model's output distribution (up to hardware
numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
equality.
However, we still need to verify that the scenarios below pass:
* Batch size 1 greedy equality
* Batch size >1 greedy equality
* Test greedy equality under preemption
* Test greedy equality under various number of speculative tokens.
With those tests, we can at least say that EAGLE does not break the
correctness of the target model outputs.
"""
import
pytest
from
.conftest
import
run_greedy_equality_correctness_test
# Main (target) model.
MAIN_MODEL = "JackFram/llama-68m"

# Speculative model paired with the main model.
SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random"

# Maximum number of speculative tokens; this corresponds to num_heads in the
# config.json of the speculator model.
MAX_SPEC_TOKENS = 4

# Precision, used as the "dtype" entry of the common LLM kwargs in the tests.
PRECISION = "float32"
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Eager mode skips CUDA graph recording, which keeps the test fast.
        "enforce_eager": True,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,

        # Leave stats logging on so the spec metrics are printed.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [{
        "speculative_model": SPEC_MODEL,
        "num_speculative_tokens": MAX_SPEC_TOKENS,
    }])
@pytest.mark.parametrize("output_len", [128])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
                                      test_llm_generator, batch_size: int,
                                      output_len: int):
    """Check greedy equality between EAGLE spec decode and the baseline.

    Runs once per parametrized batch size (1 and 32) with a forced output
    length of ``output_len`` tokens.
    """
    run_greedy_equality_correctness_test(
        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        force_output_len=True,
        max_output_len=output_len,
    )
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Eager mode is off: this variant exercises the CUDA graph path.
        "enforce_eager": False,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,

        # Leave stats logging on so the spec metrics are printed.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [{
        "speculative_model": SPEC_MODEL,
        "num_speculative_tokens": MAX_SPEC_TOKENS,
    }])
@pytest.mark.parametrize("output_len", [128])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
                                                 test_llm_generator,
                                                 batch_size: int,
                                                 output_len: int):
    """Check greedy equality with CUDA graphs enabled.

    Same comparison as the eager-mode test, run for batch sizes 1 and 32
    with ``enforce_eager`` set to False.
    """
    run_greedy_equality_correctness_test(
        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        force_output_len=True,
        max_output_len=output_len,
    )
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "block_size": 8,

        # Block budget: 2 for the small prompt, 256//8 for generated tokens.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

        # Eager mode skips CUDA graph recording, which keeps the test fast.
        "enforce_eager": True,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [{
        "speculative_model": SPEC_MODEL,
        "num_speculative_tokens": MAX_SPEC_TOKENS,
    }])
@pytest.mark.parametrize(
    "output_len",
    [
        # A small output length keeps the test fast.
        128,
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
                                                      test_llm_generator,
                                                      batch_size: int,
                                                      output_len: int):
    """Check greedy equality when sequences get preempted mid-generation.

    The GPU block count is overridden to a small value in the common kwargs
    (presumably so that the batch cannot fit and preemption is triggered).
    """
    run_greedy_equality_correctness_test(
        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        force_output_len=True,
        max_output_len=output_len,
    )
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Eager mode skips CUDA graph recording, which keeps the test fast.
        "enforce_eager": True,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {
            "speculative_model": SPEC_MODEL,
            "num_speculative_tokens": num_tokens,
        }
        # Sweep every value from 1 up to and including MAX_SPEC_TOKENS.
        for num_tokens in range(1, MAX_SPEC_TOKENS + 1)
    ])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
    "output_len",
    [
        # A smaller output length keeps the test fast.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
                           batch_size: int, output_len: int):
    """Check greedy equality across a range of num_speculative_tokens.

    EAGLE speculative decoding must produce exactly the same output as
    decoding without speculation for each parametrized token count.
    """
    run_greedy_equality_correctness_test(
        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        force_output_len=True,
        max_output_len=output_len,
    )
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Eager mode skips CUDA graph recording, which keeps the test fast.
        "enforce_eager": True,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [{
        "speculative_model": SPEC_MODEL,
        "num_speculative_tokens": MAX_SPEC_TOKENS,
        "speculative_disable_by_batch_size": 4,
    }])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # A smaller output length keeps the test fast.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_eagle_disable_queue(baseline_llm_generator, test_llm_generator,
                             batch_size: int, output_len: int):
    """Check greedy equality when speculation is disabled for large batches.

    ``speculative_disable_by_batch_size`` is set to 4 and the test runs with
    batch sizes 1 and 5; the output must equal non-speculative decoding in
    both cases.
    """
    run_greedy_equality_correctness_test(
        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        force_output_len=True,
        max_output_len=output_len,
    )
if __name__ == "__main__":
    # Allow running this test module directly as a script. pytest is already
    # imported at module level, so it is not re-imported here. The exit code
    # returned by pytest.main is propagated so that test failures produce a
    # non-zero process status instead of being silently discarded.
    raise SystemExit(pytest.main([__file__]))
tests/spec_decode/e2e/test_integration.py
View file @
af7f4372
...
...
@@ -42,3 +42,51 @@ def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
max_output_len
=
output_len
,
force_output_len
=
True
,
)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": "JackFram/llama-160m",

        # Eager mode skips CUDA graph recording, which keeps the test fast.
        "enforce_eager": True,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,
    }])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [{
        "speculative_model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
        "num_speculative_tokens": 5,
    }])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        # Draft model quantization named explicitly.
        {
            "speculative_model_quantization": "gptq",
        },
        # GPTQ-based draft model explicitly asked to use marlin quantization.
        {
            "speculative_model_quantization": "marlin",
        },
        # Draft model quantization left unspecified.
        {
            "speculative_model_quantization": None,
        },
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_speculative_model_quantization_config(baseline_llm_generator,
                                               test_llm_generator,
                                               batch_size: int):
    """Check that spec decode works with draft-model quantization configs.

    Each parametrized quantization setting must still yield greedy equality
    with the baseline over a forced output length of 32 tokens.
    """
    run_greedy_equality_correctness_test(
        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        force_output_len=True,
        max_output_len=32,
    )
tests/spec_decode/e2e/test_logprobs.py
View file @
af7f4372
...
...
@@ -343,3 +343,78 @@ def run_greedy_logprobs_correctness_test(baseline_llm_generator,
b
=
baseline_rank_to_logprob
[
rank
],
abs_tol
=
1e-1
,
)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": "JackFram/llama-160m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,
        "max_logprobs": 6,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
                         [{
                             "speculative_model": "JackFram/llama-68m",
                             "num_speculative_tokens": 3,
                             "disable_logprobs_during_spec_decoding": True,
                         }])
@pytest.mark.parametrize("seed", [1])
def test_logprobs_disabled(baseline_llm_generator, test_llm_generator):
    """Check the behavior when logprobs are disabled during spec decoding.

    With ``disable_logprobs_during_spec_decoding`` set, every generated
    position of the speculative run is expected to carry exactly one logprob
    entry whose ``logprob`` is 0.0 and whose ``rank`` is -1. The token chosen
    at each position must still match the baseline model's rank-1 token.
    """
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
        "San Francisco is know for its",
        "Facebook was created in 2004 by",
        "Curious George is a",
        "Python 3.11 brings improvements to its",
    ]

    # Build a batch of 4 prompts, cycling through the pool if necessary.
    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(4))]

    sampling_params = SamplingParams(
        # Use smaller output len for fast test
        max_tokens=7,
        ignore_eos=True,
        temperature=0.0,
        logprobs=2,
    )

    spec_batch_logprobs = get_logprobs_from_llm_generator(
        test_llm_generator, prompts, sampling_params)
    baseline_batch_logprobs = get_logprobs_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    assert len(baseline_batch_logprobs) == len(prompts)
    assert len(spec_batch_logprobs) == len(prompts)

    # For each sequence in the batch.
    for baseline_logprobs, spec_logprobs in zip(baseline_batch_logprobs,
                                                spec_batch_logprobs):
        assert len(spec_logprobs) == len(baseline_logprobs)

        # For each generated position of the sequence.
        for spec_pos_logprobs, baseline_pos_logprobs in zip(
                spec_logprobs, baseline_logprobs):

            # Only the sampled token is reported, with placeholder values.
            assert len(spec_pos_logprobs) == 1
            spec_top_token_id = list(spec_pos_logprobs)[0]

            spec_top_logprob = spec_pos_logprobs[spec_top_token_id]
            assert spec_top_logprob.logprob == 0.0
            assert spec_top_logprob.rank == -1

            # Check that the chosen token matches the base model. Assert
            # membership first so a mismatch is reported as a test failure
            # with context rather than as a bare KeyError.
            assert spec_top_token_id in baseline_pos_logprobs, (
                f"spec token id {spec_top_token_id} is missing from the "
                f"baseline logprobs {list(baseline_pos_logprobs)}")
            baseline_logprob = baseline_pos_logprobs[spec_top_token_id]
            assert baseline_logprob.rank == 1
            assert (spec_top_logprob.decoded_token ==
                    baseline_logprob.decoded_token)
tests/spec_decode/e2e/test_medusa_correctness.py
View file @
af7f4372
...
...
@@ -70,8 +70,9 @@ PRECISION = "float32"
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
32
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_mlp_e2e_greedy_correctness
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
:
int
,
output_len
:
int
):
def
test_medusa_e2e_greedy_correctness
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
:
int
,
output_len
:
int
):
"""Verify greedy equality with different batch size."""
run_greedy_equality_correctness_test
(
baseline_llm_generator
,
test_llm_generator
,
...
...
@@ -80,6 +81,49 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
force_output_len
=
True
)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Eager mode is off: this variant exercises the CUDA graph path.
        "enforce_eager": False,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,

        # Leave stats logging on so the spec metrics are printed.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [{
        "speculative_model": SPEC_MODEL,
        "num_speculative_tokens": MAX_SPEC_TOKENS,
    }])
@pytest.mark.parametrize("output_len", [128])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
                                                  test_llm_generator,
                                                  batch_size: int,
                                                  output_len: int):
    """Check greedy equality with CUDA graphs enabled.

    Runs for batch sizes 1 and 32 with ``enforce_eager`` set to False and a
    forced output length of ``output_len`` tokens.
    """
    run_greedy_equality_correctness_test(
        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        force_output_len=True,
        max_output_len=output_len,
    )
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
...
...
@@ -116,7 +160,7 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_m
lp
_e2e_greedy_correctness_with_preemption
(
baseline_llm_generator
,
def
test_m
edusa
_e2e_greedy_correctness_with_preemption
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
:
int
,
output_len
:
int
):
...
...
@@ -165,9 +209,9 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_m
lp
_different_k
(
baseline_llm_generator
,
test_llm_generator
,
def
test_m
edusa
_different_k
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
:
int
,
output_len
:
int
):
"""Verify that m
lp
speculative decoding produces exact equality
"""Verify that m
edusa
speculative decoding produces exact equality
to without spec decode with different values of num_speculative_tokens.
"""
run_greedy_equality_correctness_test
(
baseline_llm_generator
,
...
...
@@ -208,9 +252,9 @@ def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
32
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_m
lp
_disable_queue
(
baseline_llm_generator
,
test_llm_generator
,
def
test_m
edusa
_disable_queue
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
:
int
,
output_len
:
int
):
"""Verify that m
lp
speculative decoding produces exact equality
"""Verify that m
edusa
speculative decoding produces exact equality
to without spec decode when speculation is disabled for large
batch sizes.
"""
...
...
tests/spec_decode/e2e/test_mlp_correctness.py
View file @
af7f4372
...
...
@@ -19,8 +19,12 @@ With those tests, we can say at least, MLPSpeculator would not break the
correctness for the target model outputs.
"""
from
unittest.mock
import
patch
import
pytest
from
vllm.model_executor.layers.vocab_parallel_embedding
import
pad_vocab_size
from
.conftest
import
(
run_equality_correctness_test
,
run_greedy_equality_correctness_test
)
...
...
@@ -78,6 +82,48 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
force_output_len
=
True
)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Eager mode skips CUDA graph recording, which keeps the test fast.
        "enforce_eager": True,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,

        # Leave stats logging on so the spec metrics are printed.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "speculative_model": SPEC_MODEL,
}])
@pytest.mark.parametrize("output_len", [2048])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
                                 batch_size: int, output_len: int):
    """Check the acceptance rate for several batch sizes and a long output.

    Besides output equality at temperature 0.0 (seeded), the helper is given
    an expected acceptance rate of 0.48 to check the measured rate against.
    """
    run_equality_correctness_test(
        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        max_output_len=output_len,
        force_output_len=True,
        seeded=True,
        temperature=0.0,
        expected_acceptance_rate=0.48,
    )
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
...
...
@@ -178,6 +224,62 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
force_output_len
=
True
)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "block_size": 8,

        # Block budget: 2 for the small prompt, 256//8 for generated tokens.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

        # Eager mode skips CUDA graph recording, which keeps the test fast.
        "enforce_eager": True,

        # Spec decode requires the v2 block manager.
        "use_v2_block_manager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "speculative_model": SPEC_MODEL,
}])
@pytest.mark.parametrize(
    "output_len",
    [
        # A small output length keeps the test fast.
        128,
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
                                                 test_llm_generator,
                                                 batch_size: int,
                                                 output_len: int):
    """Check greedy equality when the vocab dimension is padded."""
    patch_target = (
        "vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size")

    # Default pad_to is 64, test model has vocab_size of 32000.
    # The replacement ignores the caller's pad_to and always pads to 32064.
    # It calls the pad_vocab_size name imported into this test module, which
    # was bound before patching, so it reaches the real implementation and
    # does not recurse into itself.
    def force_pad_to_32064(vocab_size, pad_to=None):
        padded = pad_vocab_size(vocab_size, pad_to=32064)
        return padded

    with patch(patch_target, force_pad_to_32064):
        run_greedy_equality_correctness_test(
            baseline_llm_generator,
            test_llm_generator,
            batch_size,
            force_output_len=True,
            max_output_len=output_len,
        )
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
...
...
Prev
1
…
8
9
10
11
12
13
14
15
16
…
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment