Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
500b93c8
Commit
500b93c8
authored
Jul 25, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.3.post1' into v0.5.3.post1-dtk24.04.1
parents
99426767
38c4b7e8
Changes
282
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
705 additions
and
71 deletions
+705
-71
tests/lora/test_lora_huggingface.py
tests/lora/test_lora_huggingface.py
+39
-0
tests/lora/test_utils.py
tests/lora/test_utils.py
+56
-1
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+103
-0
tests/models/test_chameleon.py
tests/models/test_chameleon.py
+102
-0
tests/models/test_fuyu.py
tests/models/test_fuyu.py
+4
-3
tests/models/test_jamba.py
tests/models/test_jamba.py
+83
-0
tests/models/test_llava.py
tests/models/test_llava.py
+0
-2
tests/models/test_llava_next.py
tests/models/test_llava_next.py
+0
-2
tests/models/test_paligemma.py
tests/models/test_paligemma.py
+21
-4
tests/models/test_phi3v.py
tests/models/test_phi3v.py
+8
-3
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+5
-5
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+7
-0
tests/quantization/test_configs.py
tests/quantization/test_configs.py
+2
-2
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+35
-5
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+53
-3
tests/spec_decode/e2e/conftest.py
tests/spec_decode/e2e/conftest.py
+81
-16
tests/spec_decode/e2e/test_integration_dist_tp2.py
tests/spec_decode/e2e/test_integration_dist_tp2.py
+3
-0
tests/spec_decode/e2e/test_integration_dist_tp4.py
tests/spec_decode/e2e/test_integration_dist_tp4.py
+62
-0
tests/spec_decode/e2e/test_logprobs.py
tests/spec_decode/e2e/test_logprobs.py
+29
-19
tests/spec_decode/e2e/test_multistep_correctness.py
tests/spec_decode/e2e/test_multistep_correctness.py
+12
-6
No files found.
tests/lora/test_lora_huggingface.py
0 → 100644
View file @
500b93c8
from
typing
import
List
import
pytest
from
vllm.lora.models
import
LoRAModel
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
# Provide absolute path and huggingface lora ids
lora_fixture_name
=
[
"sql_lora_files"
,
"sql_lora_huggingface_id"
]
@pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
    """Load a LoRA checkpoint referenced by a pytest fixture.

    The fixture named by ``lora_fixture_name`` is resolved dynamically, its
    value is turned into a path with ``get_adapter_absolute_path`` and the
    result is handed to ``LoRAModel.from_local_checkpoint``.
    """
    adapter_ref = request.getfixturevalue(lora_fixture_name)

    llama_cls = LlamaForCausalLM
    packed_mapping = llama_cls.packed_modules_mapping

    # A module listed in the packed mapping contributes the entries it maps
    # to; any other supported module contributes just itself.
    expected_lora_modules: List[str] = []
    for module_name in llama_cls.supported_lora_modules:
        if module_name in packed_mapping:
            expected_lora_modules += packed_mapping[module_name]
        else:
            expected_lora_modules.append(module_name)

    # LoRA loading should work for either an absolute path or a
    # Hugging Face id.
    checkpoint_dir = get_adapter_absolute_path(adapter_ref)
    loaded = LoRAModel.from_local_checkpoint(
        checkpoint_dir,
        expected_lora_modules,
        lora_model_id=1,
        device="cpu",
        embedding_modules=llama_cls.embedding_modules,
        embedding_padding_modules=llama_cls.embedding_padding_modules,
    )

    # Assertions to ensure the model is loaded correctly
    assert loaded is not None, "LoRAModel is not loaded correctly"
tests/lora/test_utils.py
View file @
500b93c8
from
collections
import
OrderedDict
from
collections
import
OrderedDict
from
unittest.mock
import
patch
import
pytest
import
pytest
from
huggingface_hub.utils
import
HfHubHTTPError
from
torch
import
nn
from
torch
import
nn
from
vllm.lora.utils
import
parse_fine_tuned_lora_name
,
replace_submodule
from
vllm.lora.utils
import
(
get_adapter_absolute_path
,
parse_fine_tuned_lora_name
,
replace_submodule
)
from
vllm.utils
import
LRUCache
from
vllm.utils
import
LRUCache
...
@@ -182,3 +185,55 @@ def test_lru_cache():
...
@@ -182,3 +185,55 @@ def test_lru_cache():
assert
2
in
cache
assert
2
in
cache
assert
4
in
cache
assert
4
in
cache
assert
6
in
cache
assert
6
in
cache
# Unit tests for get_adapter_absolute_path
@patch('os.path.isabs', return_value=True)
def test_get_adapter_absolute_path_absolute(mock_isabs):
    """A path reported as absolute by ``os.path.isabs`` comes back as is."""
    given = '/absolute/path/to/lora'
    resolved = get_adapter_absolute_path(given)
    assert resolved == given
@patch('os.path.expanduser', return_value='/home/user/relative/path/to/lora')
def test_get_adapter_absolute_path_expanduser(mock_expanduser):
    """A ``~``-prefixed path resolves to what ``os.path.expanduser`` yields."""
    # Path with ~ that needs to be expanded
    tilde_path = '~/relative/path/to/lora'
    expected = '/home/user/relative/path/to/lora'
    assert get_adapter_absolute_path(tilde_path) == expected
@patch('os.path.exists', return_value=True)
@patch('os.path.abspath', return_value='/absolute/path/to/lora')
def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist):
    """A relative path that exists resolves to ``os.path.abspath``'s result."""
    # Relative path that exists locally
    relative = 'relative/path/to/lora'
    expected = '/absolute/path/to/lora'
    assert get_adapter_absolute_path(relative) == expected
@patch('huggingface_hub.snapshot_download',
       return_value='/mock/snapshot/path')
@patch('os.path.exists', return_value=False)
def test_get_adapter_absolute_path_huggingface(mock_exist,
                                               mock_snapshot_download):
    """A non-local ``org/repo`` id resolves to the mocked snapshot path."""
    # Hugging Face model identifier
    repo_id = 'org/repo'
    expected = '/mock/snapshot/path'
    assert get_adapter_absolute_path(repo_id) == expected
@patch('huggingface_hub.snapshot_download')
@patch('os.path.exists', return_value=False)
def test_get_adapter_absolute_path_huggingface_error(mock_exist,
                                                     mock_snapshot_download):
    """When the mocked download raises, the input is returned unchanged."""
    # Hugging Face model identifier with download error
    repo_id = 'org/repo'
    download_failure = HfHubHTTPError("failed to query model info")
    mock_snapshot_download.side_effect = download_failure
    assert get_adapter_absolute_path(repo_id) == repo_id
tests/metrics/test_metrics.py
View file @
500b93c8
from
typing
import
List
from
typing
import
List
import
pytest
import
pytest
import
ray
from
prometheus_client
import
REGISTRY
from
prometheus_client
import
REGISTRY
from
vllm
import
EngineArgs
,
LLMEngine
from
vllm
import
EngineArgs
,
LLMEngine
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.metrics
import
RayPrometheusStatLogger
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
MODELS
=
[
MODELS
=
[
...
@@ -168,6 +170,55 @@ def test_engine_log_metrics_regression(
...
@@ -168,6 +170,55 @@ def test_engine_log_metrics_regression(
assert_metrics
(
engine
,
disable_log_stats
,
len
(
example_prompts
))
assert_metrics
(
engine
,
disable_log_stats
,
len
(
example_prompts
))
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
def test_metric_spec_decode(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check that speculative-decoding metrics fall inside loose bounds.

    The same model is used as both target and draft model, with ``k``
    speculative tokens. After one greedy generation, each spec-decode metric
    is read from the 'prometheus' stat logger and checked with a predicate.
    """
    k = 5

    # Note that the purpose of this test is to verify spec decode
    # metrics instead of functional correctness, so the expected values
    # are intended to be loose.
    metric_name_to_expected_fn = {
        "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
        "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
        "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
        "counter_spec_decode_num_draft_tokens": lambda v: v == k,
        "counter_spec_decode_num_emitted_tokens": lambda v: 0 <= v <= k + 1,
    }

    runner_kwargs = dict(
        dtype=dtype,
        disable_log_stats=False,
        gpu_memory_utilization=0.4,
        speculative_model=model,
        num_speculative_tokens=k,
        use_v2_block_manager=True,
    )
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        # Force log interval to be 0 to catch all metrics.
        engine = vllm_model.model.llm_engine
        stat_logger = engine.stat_loggers['prometheus']
        stat_logger.local_interval = 0

        # Use one request to better inspect the metrics.
        single_prompt = example_prompts[:1]
        vllm_model.generate_greedy(single_prompt, max_tokens)

        for metric_name, is_expected in metric_name_to_expected_fn.items():
            metric = getattr(stat_logger.metrics, metric_name)
            labelled = metric.labels(**stat_logger.labels)
            metric_val = labelled._value.get()
            assert is_expected(metric_val), (
                f"the value of metric {metric_name} ({metric_val}) "
                "does not meet expectation")
def
assert_metrics
(
engine
:
LLMEngine
,
disable_log_stats
:
bool
,
def
assert_metrics
(
engine
:
LLMEngine
,
disable_log_stats
:
bool
,
num_requests
:
int
)
->
None
:
num_requests
:
int
)
->
None
:
if
disable_log_stats
:
if
disable_log_stats
:
...
@@ -192,3 +243,55 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
...
@@ -192,3 +243,55 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
labels
)
labels
)
assert
(
assert
(
metric_value
==
num_requests
),
"Metrics should be collected"
metric_value
==
num_requests
),
"Metrics should be collected"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
def test_engine_log_metrics_ray(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Smoke-test ``RayPrometheusStatLogger`` inside a Ray task.

    This test is quite weak - it only checks that we can use
    RayPrometheusStatLogger without exceptions. Checking whether the
    metrics are actually emitted is unfortunately non-trivial.
    """

    # We have to run in a Ray task for Ray metrics to be emitted correctly
    @ray.remote(num_gpus=1)
    def _inner():

        class _RayPrometheusStatLogger(RayPrometheusStatLogger):
            """Stat logger that counts how many times ``log`` is invoked."""

            def __init__(self, *args, **kwargs):
                # The counter is set before delegating to the parent
                # constructor, as in the original ordering.
                self._i = 0
                super().__init__(*args, **kwargs)

            def log(self, *args, **kwargs):
                self._i += 1
                return super().log(*args, **kwargs)

        engine = LLMEngine.from_engine_args(
            EngineArgs(
                model=model,
                dtype=dtype,
                disable_log_stats=False,
            ))

        model_config = engine.model_config
        logger = _RayPrometheusStatLogger(
            local_interval=0.5,
            labels=dict(model_name=model_config.served_model_name),
            max_model_len=model_config.max_model_len)
        engine.add_logger("ray", logger)

        for i, prompt in enumerate(example_prompts):
            sampling = SamplingParams(max_tokens=max_tokens)
            engine.add_request(f"request-id-{i}", prompt, sampling)

        # Drive the engine until every queued request has finished.
        while engine.has_unfinished_requests():
            engine.step()

        assert logger._i > 0, ".log must be called at least once"

    ray.get(_inner.remote())
tests/models/test_chameleon.py
0 → 100644
View file @
500b93c8
import
re
from
typing
import
List
,
Optional
,
Type
import
pytest
from
vllm.multimodal.utils
import
rescale_image_size
from
..conftest
import
IMAGE_ASSETS
,
VllmRunner
,
_ImageAssets
pytestmark
=
pytest
.
mark
.
vlm
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"USER: <image>
\n
What's the content of the image?
\n
ASSISTANT:"
,
"cherry_blossom"
:
"USER: <image>
\n
What is the season?
\n
ASSISTANT:"
,
})
models
=
[
"facebook/chameleon-7b"
]
#TODO (ywang96): Add correctness test when chameleon is
# available on transformers.
def run_test(
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Test if the model can generate text given
    a batch of images and prompts.

    For every image asset, the matching prompt is repeated once per entry of
    ``size_factors`` and paired with the image rescaled by that factor. Each
    such batch is generated greedily, and every output is checked to start
    with its prompt (after stripping special tokens) and to contain more
    than 10 characters beyond the prompt.

    Args:
        vllm_runner: Context-manager factory producing the vLLM model.
        image_assets: Iterable of assets exposing ``pil_image``.
        model: Model identifier passed to ``vllm_runner``.
        size_factors: Rescale factors; one prompt/image pair per factor.
        dtype: Dtype string forwarded to ``vllm_runner``.
        max_tokens: Maximum number of tokens for greedy generation.
        tensor_parallel_size: Forwarded to ``vllm_runner``.
        distributed_executor_backend: Forwarded to ``vllm_runner``.
    """
    images = [asset.pil_image for asset in image_assets]

    inputs_per_image = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    # Special tokens stripped from the generated text so it can be compared
    # with the original prompt. The pattern is loop-invariant, so it is built
    # once here rather than per output; every token maps to the empty string.
    special_tokens = ("<racm3:break>", "<eoss>", "<reserved08706>")
    special_token_re = re.compile("|".join(map(re.escape, special_tokens)))

    with vllm_runner(model,
                     max_model_len=4096,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:

        for prompts, scaled_images in inputs_per_image:
            vllm_outputs = vllm_model.generate_greedy(prompts,
                                                      max_tokens,
                                                      images=scaled_images)
            for i, output in enumerate(vllm_outputs):
                generated_text = output[1]

                # format prompt back to original
                vllm_result = special_token_re.sub("", generated_text)
                # Drops up to 1023 "<image>" occurrences (presumably the
                # expanded image placeholder tokens - TODO confirm).
                vllm_result = vllm_result.replace("<image>", "", 1023)
                assert vllm_result[:len(prompts[i])] == prompts[i]

                # assert at least 10 new characters are generated
                # (to take stop token into account)
                assert len(generated_text) - len(prompts[i]) > 10
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(vllm_runner, image_assets, model, size_factors, dtype: str,
                max_tokens: int) -> None:
    """Run ``run_test`` for each parametrized combination on a single GPU."""
    keyword_args = dict(
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        tensor_parallel_size=1,
    )
    run_test(vllm_runner, image_assets, model, **keyword_args)
tests/models/test_fuyu.py
View file @
500b93c8
...
@@ -12,9 +12,10 @@ from .utils import check_logprobs_close
...
@@ -12,9 +12,10 @@ from .utils import check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
pytestmark
=
pytest
.
mark
.
vlm
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"What's the content of the image?
\n
"
,
# noqa: E501
"stop_sign"
:
"cherry_blossom"
:
"What is the season?
\n
"
,
"What's the content of the image?
\n
"
,
"boardwalk"
:
"What's in this image?
\n
"
,
"cherry_blossom"
:
"What is the season?
\n
"
,
})
})
models
=
[
"adept/fuyu-8b"
]
models
=
[
"adept/fuyu-8b"
]
...
...
tests/models/test_jamba.py
View file @
500b93c8
import
pytest
import
pytest
from
tests.models.utils
import
check_outputs_equal
from
vllm.worker.model_runner
import
_get_graph_batch_size
from
vllm.worker.model_runner
import
_get_graph_batch_size
MODELS
=
[
"ai21labs/Jamba-tiny-random"
]
MODELS
=
[
"ai21labs/Jamba-tiny-random"
]
...
@@ -34,6 +35,34 @@ def test_models(
...
@@ -34,6 +35,34 @@ def test_models(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Compare one-prompt-at-a-time generation with batched generation.

    Each prompt is first generated on its own, then all prompts are generated
    in one call; ``check_outputs_equal`` compares the two result lists.
    """
    # To pass the small model tests, we need full precision.
    with vllm_runner(model, dtype=dtype) as vllm_model:
        one_by_one = [
            vllm_model.generate_greedy([single_prompt], max_tokens)[0]
            for single_prompt in example_prompts
        ]
        all_at_once = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=one_by_one,
        outputs_1_lst=all_at_once,
        name_0="for_loop_vllm",
        name_1="batched_vllm",
    )
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
20
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
20
])
...
@@ -60,6 +89,60 @@ def test_mamba_cache_cg_padding(
...
@@ -60,6 +89,60 @@ def test_mamba_cache_cg_padding(
"Could be related to mamba cache not padded correctly"
)
"Could be related to mamba cache not padded correctly"
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Outputs must match with and without artificial preemption.

    The first scheduler's ``ENABLE_ARTIFICIAL_PREEMPT`` flag is switched on
    for one greedy generation and off for a second one over the same prompts;
    ``check_outputs_equal`` then compares the two runs.
    """
    # Tests that outputs are identical with and w/o preemptions (recompute)
    assert dtype == "float"

    def generate(runner, artificial_preempt):
        # Set the flag on the first scheduler, then generate greedily.
        scheduler = runner.model.llm_engine.scheduler[0]
        scheduler.ENABLE_ARTIFICIAL_PREEMPT = artificial_preempt
        return runner.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        with_preemption = generate(vllm_model, True)
        without_preemption = generate(vllm_model, False)

    check_outputs_equal(
        outputs_0_lst=with_preemption,
        outputs_1_lst=without_preemption,
        name_0="vllm_preepmtions",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Generate 100 copies of one prompt with ``max_num_seqs=10``.

    The test fails explicitly if the run raises ``ValueError``.
    """
    # This test is for verifying that the Jamba inner state management doesn't
    # collapse in case where the number of incoming requests and
    # finished_requests_ids is larger than the maximum mamba block capacity.
    # This could generally happen due to the fact that Jamba does support
    # statelessness mechanism where it can cleanup new incoming requests in
    # a single step.
    try:
        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        # The two literals are implicitly concatenated; the space after
        # "between" keeps the message from reading "betweensteps".
        pytest.fail("Jamba inner state wasn't cleaned up properly between "
                    "steps finished requests registered unnecessarily ")
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
def
test_state_cleanup
(
def
test_state_cleanup
(
...
...
tests/models/test_llava.py
View file @
500b93c8
...
@@ -16,8 +16,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
...
@@ -16,8 +16,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"USER: <image>
\n
What's the content of the image?
\n
ASSISTANT:"
,
"USER: <image>
\n
What's the content of the image?
\n
ASSISTANT:"
,
"cherry_blossom"
:
"cherry_blossom"
:
"USER: <image>
\n
What is the season?
\n
ASSISTANT:"
,
"USER: <image>
\n
What is the season?
\n
ASSISTANT:"
,
"boardwalk"
:
"USER: <image>
\n
What's in this image?
\n
ASSISTANT:"
,
})
})
IMAGE_TOKEN_ID
=
32000
IMAGE_TOKEN_ID
=
32000
...
...
tests/models/test_llava_next.py
View file @
500b93c8
...
@@ -23,8 +23,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
...
@@ -23,8 +23,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
f
"
{
_PREFACE
}
USER: <image>
\n
What's the content of the image? ASSISTANT:"
,
f
"
{
_PREFACE
}
USER: <image>
\n
What's the content of the image? ASSISTANT:"
,
"cherry_blossom"
:
"cherry_blossom"
:
f
"
{
_PREFACE
}
USER: <image>
\n
What is the season? ASSISTANT:"
,
f
"
{
_PREFACE
}
USER: <image>
\n
What is the season? ASSISTANT:"
,
"boardwalk"
:
f
"
{
_PREFACE
}
USER: <image>
\n
What's in this image? ASSISTANT:"
,
})
})
IMAGE_TOKEN_ID
=
32000
IMAGE_TOKEN_ID
=
32000
...
...
tests/models/test_paligemma.py
View file @
500b93c8
import
os
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
List
,
Optional
,
Tuple
,
Type
import
pytest
import
pytest
...
@@ -5,6 +6,7 @@ from transformers import AutoTokenizer
...
@@ -5,6 +6,7 @@ from transformers import AutoTokenizer
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_hip
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
.utils
import
check_logprobs_close
from
.utils
import
check_logprobs_close
...
@@ -12,15 +14,22 @@ from .utils import check_logprobs_close
...
@@ -12,15 +14,22 @@ from .utils import check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
pytestmark
=
pytest
.
mark
.
vlm
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"caption es"
,
"stop_sign"
:
"cherry_blossom"
:
"What is in the picture?"
,
"caption es"
,
"boardwalk"
:
"What is in the picture?"
,
"cherry_blossom"
:
"What is in the picture?"
,
})
})
IMAGE_TOKEN_ID
=
257152
IMAGE_TOKEN_ID
=
257152
models
=
[
"google/paligemma-3b-mix-224"
]
models
=
[
"google/paligemma-3b-mix-224"
]
# ROCm Triton FA can run into compilation issues with these models due to
# excessive use of shared memory. Use other backends in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if
is_hip
():
os
.
environ
[
"VLLM_USE_TRITON_FLASH_ATTN"
]
=
"0"
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]],
Optional
[
SampleLogprobs
]],
...
@@ -129,7 +138,15 @@ def run_test(
...
@@ -129,7 +138,15 @@ def run_test(
[
0.25
,
0.5
,
1.0
],
[
0.25
,
0.5
,
1.0
],
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
,
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
pytest
.
param
(
"float"
,
marks
=
pytest
.
mark
.
skipif
(
is_hip
(),
reason
=
"ROCm FA does not yet fully support 32-bit precision on PaliGemma"
)
),
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
...
...
tests/models/test_phi3v.py
View file @
500b93c8
import
os
import
re
import
re
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
List
,
Optional
,
Tuple
,
Type
...
@@ -6,7 +7,7 @@ from transformers import AutoTokenizer
...
@@ -6,7 +7,7 @@ from transformers import AutoTokenizer
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
from
vllm.utils
import
is_cpu
,
is_hip
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
.utils
import
check_logprobs_close
from
.utils
import
check_logprobs_close
...
@@ -18,8 +19,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
...
@@ -18,8 +19,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"<|user|>
\n
<|image_1|>
\n
What's the content of the image?<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
"<|user|>
\n
<|image_1|>
\n
What's the content of the image?<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
"cherry_blossom"
:
"cherry_blossom"
:
"<|user|>
\n
<|image_1|>
\n
What is the season?<|end|>
\n
<|assistant|>
\n
"
,
"<|user|>
\n
<|image_1|>
\n
What is the season?<|end|>
\n
<|assistant|>
\n
"
,
"boardwalk"
:
"<|user|>
\n
<|image_1|>
\n
What's in this image?<|end|>
\n
<|assistant|>
\n
"
,
})
})
models
=
[
"microsoft/Phi-3-vision-128k-instruct"
]
models
=
[
"microsoft/Phi-3-vision-128k-instruct"
]
...
@@ -49,6 +48,12 @@ target_dtype = "half"
...
@@ -49,6 +48,12 @@ target_dtype = "half"
if
is_cpu
():
if
is_cpu
():
target_dtype
=
"bfloat16"
target_dtype
=
"bfloat16"
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if
is_hip
():
os
.
environ
[
"VLLM_USE_TRITON_FLASH_ATTN"
]
=
"0"
def
run_test
(
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
Type
[
HfRunner
],
...
...
tests/multimodal/test_utils.py
View file @
500b93c8
...
@@ -7,7 +7,7 @@ import numpy as np
...
@@ -7,7 +7,7 @@ import numpy as np
import
pytest
import
pytest
from
PIL
import
Image
from
PIL
import
Image
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
fetch_image
from
vllm.multimodal.utils
import
async_fetch_image
,
fetch_image
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS
=
[
TEST_IMAGE_URLS
=
[
...
@@ -37,15 +37,15 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
...
@@ -37,15 +37,15 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
return
(
np
.
asarray
(
a
)
==
np
.
asarray
(
b
.
convert
(
a
.
mode
))).
all
()
return
(
np
.
asarray
(
a
)
==
np
.
asarray
(
b
.
convert
(
a
.
mode
))).
all
()
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_fetch_image_http
(
image_url
:
str
):
async
def
test_fetch_image_http
(
image_url
:
str
):
image_sync
=
fetch_image
(
image_url
)
image_sync
=
fetch_image
(
image_url
)
image_async
=
await
ImageFetchAiohttp
.
fetch_image
(
image_url
)
image_async
=
await
async_
fetch_image
(
image_url
)
assert
_image_equals
(
image_sync
,
image_async
)
assert
_image_equals
(
image_sync
,
image_async
)
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"suffix"
,
get_supported_suffixes
())
@
pytest
.
mark
.
parametrize
(
"suffix"
,
get_supported_suffixes
())
async
def
test_fetch_image_base64
(
url_images
:
Dict
[
str
,
Image
.
Image
],
async
def
test_fetch_image_base64
(
url_images
:
Dict
[
str
,
Image
.
Image
],
...
@@ -78,5 +78,5 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
...
@@ -78,5 +78,5 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
else
:
else
:
pass
# Lossy format; only check that image can be opened
pass
# Lossy format; only check that image can be opened
data_image_async
=
await
ImageFetchAiohttp
.
fetch_image
(
data_url
)
data_image_async
=
await
async_
fetch_image
(
data_url
)
assert
_image_equals
(
data_image_sync
,
data_image_async
)
assert
_image_equals
(
data_image_sync
,
data_image_async
)
tests/quantization/test_compressed_tensors.py
View file @
500b93c8
...
@@ -150,3 +150,10 @@ def test_compressed_tensors_fp8(vllm_runner):
...
@@ -150,3 +150,10 @@ def test_compressed_tensors_fp8(vllm_runner):
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
assert
output
assert
output
def test_compressed_tensors_kv_cache(vllm_runner):
    """Run a compressed-tensors KV-cache-scheme checkpoint with an fp8 cache.

    Only checks that greedy generation returns a truthy result.
    """
    checkpoint = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
    runner = vllm_runner(checkpoint, kv_cache_dtype="fp8")
    with runner as llm:
        generated = llm.generate_greedy("Hello world!", max_tokens=20)
        assert generated
tests/quantization/test_configs.py
View file @
500b93c8
...
@@ -44,9 +44,9 @@ MODEL_ARG_EXPTYPES = [
...
@@ -44,9 +44,9 @@ MODEL_ARG_EXPTYPES = [
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"awq"
,
"ERROR"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"awq"
,
"ERROR"
),
# AUTOAWQ
# AUTOAWQ
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
None
,
"awq"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
None
,
"awq
_marlin
"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"awq"
,
"awq"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"awq"
,
"awq"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"marlin"
,
"
ERROR
"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"marlin"
,
"
awq_marlin
"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"gptq"
,
"ERROR"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"gptq"
,
"ERROR"
),
]
]
...
...
tests/quantization/test_fp8.py
View file @
500b93c8
...
@@ -7,19 +7,49 @@ import torch
...
@@ -7,19 +7,49 @@ import torch
from
tests.quantization.utils
import
is_quant_method_supported
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.quantization.fp8
import
Fp8LinearMethod
from
vllm.model_executor.layers.quantization.fp8
import
(
Fp8KVCacheMethod
,
Fp8LinearMethod
)
MODELS
=
[
MODELS
=
[
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
,
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8
-KV
"
,
"nm-testing/Phi-3-mini-128k-instruct-FP8"
,
"nm-testing/Phi-3-mini-128k-instruct-FP8"
,
]
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
MODELS
)
def
test_model_load_and_run
(
vllm_runner
,
model
:
str
):
def
test_model_load_and_run
(
vllm_runner
,
model_id
:
str
):
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model_id
)
as
llm
:
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs
=
llm
.
generate_greedy
(
prompts
=
[
"Hello my name is"
],
max_tokens
=
10
)
print
(
outputs
[
0
][
1
])
KV_CACHE_MODELS
=
[
# Deprecated AutoFP8 format using .kv_scale
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
,
# AutoFP8 format using separate .k_scale and .v_scale
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"
,
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
KV_CACHE_MODELS
)
def
test_kv_cache_model_load_and_run
(
vllm_runner
,
model_id
:
str
):
with
vllm_runner
(
model_id
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
attn
=
model
.
model
.
layers
[
0
].
self_attn
.
attn
assert
isinstance
(
attn
.
quant_method
,
Fp8KVCacheMethod
)
# NOTE: it is valid for scales to be 1.0 (default value), but we know
# these checkpoints have scales < 1.0
assert
0.0
<
attn
.
_k_scale
<
1.0
assert
0.0
<
attn
.
_v_scale
<
1.0
# note: this does not test accuracy, just that we can run through
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
# see lm-eval tests for accuracy
outputs
=
llm
.
generate_greedy
(
prompts
=
[
"Hello my name is"
],
outputs
=
llm
.
generate_greedy
(
prompts
=
[
"Hello my name is"
],
...
...
tests/samplers/test_rejection_sampler.py
View file @
500b93c8
...
@@ -150,9 +150,54 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
...
@@ -150,9 +150,54 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
high
=
vocab_size
,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
dtype
=
torch
.
int64
)
generators
=
[
None
]
*
batch_size
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
)
draft_token_ids
,
generators
)
@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0])
@pytest.mark.parametrize("k", [1, 3, 6])
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
@pytest.mark.parametrize("n_rep", [100])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
                                   frac_seeded: float, n_rep: int,
                                   device: str):
    """Seeded batch rows must give identical results on every repetition.

    A random mask picks which batch rows get a ``torch.Generator`` seeded
    with the row index; the remaining rows get ``None``. The sampler is run
    ``n_rep`` times on the same inputs with freshly seeded generators, and
    the output of every seeded row is compared against the first run.
    """
    torch.set_default_device(device)
    rejection_sampler = RejectionSampler()
    rejection_sampler.init_gpu_tensors(rank=0)

    prob_shape = (batch_size, k, vocab_size)
    draft_probs = torch.rand(*prob_shape, dtype=torch.float32)
    target_probs = torch.rand(*prob_shape, dtype=torch.float32)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    draft_token_ids = torch.randint(low=0,
                                    high=vocab_size,
                                    size=(batch_size, k),
                                    dtype=torch.int64)

    seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded

    def fresh_generators():
        # One entry per batch row: a generator seeded with the row index for
        # masked rows, ``None`` otherwise.
        return [
            torch.Generator(device=device).manual_seed(i)
            if seeded_mask[i] else None for i in range(batch_size)
        ]

    results = [
        rejection_sampler(target_probs, bonus_token_ids, draft_probs,
                          draft_token_ids, fresh_generators())
        for _ in range(n_rep)
    ]

    first_run = results[0]
    later_runs = results[1:]
    for i in range(batch_size):
        if not seeded_mask[i]:
            continue
        for run in later_runs:
            assert torch.equal(run[i], first_run[i])
@
pytest
.
mark
.
parametrize
(
"above_or_below_vocab_range"
,
[
"above"
,
"below"
])
@
pytest
.
mark
.
parametrize
(
"above_or_below_vocab_range"
,
[
"above"
,
"below"
])
...
@@ -197,10 +242,11 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
...
@@ -197,10 +242,11 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
raise
AssertionError
()
raise
AssertionError
()
oob_token_ids
[
0
][
0
]
=
rogue_token_id
oob_token_ids
[
0
][
0
]
=
rogue_token_id
generators
=
[
None
]
*
batch_size
with
pytest
.
raises
(
AssertionError
):
with
pytest
.
raises
(
AssertionError
):
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
)
draft_token_ids
,
generators
)
@
pytest
.
mark
.
parametrize
(
"draft_and_target_probs_equal"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"draft_and_target_probs_equal"
,
[
True
,
False
])
...
@@ -371,11 +417,15 @@ class _CorrectnessTestHelper:
...
@@ -371,11 +417,15 @@ class _CorrectnessTestHelper:
dtype
=
torch
.
int64
,
dtype
=
torch
.
int64
,
device
=
"cuda"
).
repeat
(
num_samples
,
1
)
device
=
"cuda"
).
repeat
(
num_samples
,
1
)
# unseeded
generators
=
[
None
]
# Get output tokens via rejection sampling.
# Get output tokens via rejection sampling.
output_token_ids
=
self
.
rejection_sampler
(
target_probs
.
to
(
"cuda"
),
output_token_ids
=
self
.
rejection_sampler
(
target_probs
.
to
(
"cuda"
),
bonus_token_ids
.
to
(
"cuda"
),
bonus_token_ids
.
to
(
"cuda"
),
draft_probs
.
to
(
"cuda"
),
draft_probs
.
to
(
"cuda"
),
draft_token_ids
.
to
(
"cuda"
))
draft_token_ids
.
to
(
"cuda"
),
generators
)
# Remove bonus tokens
# Remove bonus tokens
output_token_ids
=
output_token_ids
[:,
:
-
1
].
flatten
()
output_token_ids
=
output_token_ids
[:,
:
-
1
].
flatten
()
...
...
tests/spec_decode/e2e/conftest.py
View file @
500b93c8
import
asyncio
import
asyncio
from
itertools
import
cycle
from
itertools
import
cycle
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Union
import
pytest
import
pytest
import
ray
import
ray
...
@@ -128,7 +128,9 @@ class AsyncLLM:
...
@@ -128,7 +128,9 @@ class AsyncLLM:
try
:
try
:
for
i
in
range
(
num_requests
):
for
i
in
range
(
num_requests
):
prompt
=
prompts
[
i
]
if
prompts
is
not
None
else
None
prompt
=
prompts
[
i
]
if
prompts
is
not
None
else
None
res
=
asyncio
.
run
(
get_output
(
prompt
,
sampling_params
))
params
=
sampling_params
[
i
]
if
isinstance
(
sampling_params
,
Sequence
)
else
sampling_params
res
=
asyncio
.
run
(
get_output
(
prompt
,
params
))
outputs
.
append
(
res
)
outputs
.
append
(
res
)
finally
:
finally
:
ray
.
shutdown
()
ray
.
shutdown
()
...
@@ -162,6 +164,11 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
...
@@ -162,6 +164,11 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
}
}
test_name
=
request
.
node
.
name
test_name
=
request
.
node
.
name
model
=
kwargs
[
"model"
]
draft_model
=
kwargs
.
get
(
"speculative_model"
,
None
)
same_draft_target_model
=
(
draft_model
is
not
None
and
draft_model
==
model
)
def
generator_inner
():
def
generator_inner
():
wait_for_gpu_memory_to_clear
(
wait_for_gpu_memory_to_clear
(
...
@@ -177,6 +184,13 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
...
@@ -177,6 +184,13 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
print
(
f
'Creating
{
baseline_or_test
=
}
LLM for
{
test_name
=
}
.
{
kwargs
=
}
'
)
print
(
f
'Creating
{
baseline_or_test
=
}
LLM for
{
test_name
=
}
.
{
kwargs
=
}
'
)
llm
=
AsyncLLM
(
**
kwargs
)
if
use_async
else
LLM
(
**
kwargs
)
llm
=
AsyncLLM
(
**
kwargs
)
if
use_async
else
LLM
(
**
kwargs
)
# Override logging interval to 0 for spec decode test run to
# log all metrics in time.
if
(
baseline_or_test
==
"test"
and
not
use_async
and
llm
.
llm_engine
.
log_stats
):
for
sate_logger
in
llm
.
llm_engine
.
stat_loggers
.
values
():
sate_logger
.
local_interval
=
0
set_random_seed
(
seed
)
set_random_seed
(
seed
)
yield
llm
yield
llm
...
@@ -188,6 +202,9 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
...
@@ -188,6 +202,9 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
yield
llm
yield
llm
del
llm
del
llm
# Set an attribute to the generator_outer function to allow us to
# determine whether to further check the acceptance rate in tests.
generator_outer
.
same_draft_target_model
=
same_draft_target_model
# type: ignore
return
generator_outer
return
generator_outer
...
@@ -204,18 +221,27 @@ def maybe_assert_ngram_worker(llm):
...
@@ -204,18 +221,27 @@ def maybe_assert_ngram_worker(llm):
def
get_output_from_llm_generator
(
def
get_output_from_llm_generator
(
llm_generator
,
prompts
,
llm_generator
,
prompts
,
sampling_params
)
->
Tuple
[
List
[
str
],
List
[
List
[
int
]]]:
sampling_params
)
->
Tuple
[
List
[
str
],
List
[
List
[
int
]]
,
float
]:
tokens
:
List
[
str
]
=
[]
tokens
:
List
[
str
]
=
[]
token_ids
:
List
[
List
[
int
]]
=
[]
token_ids
:
List
[
List
[
int
]]
=
[]
acceptance_rate
:
float
=
-
1.0
for
llm
in
llm_generator
():
for
llm
in
llm_generator
():
maybe_assert_ngram_worker
(
llm
)
maybe_assert_ngram_worker
(
llm
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
token_ids
=
[
output
.
outputs
[
0
].
token_ids
for
output
in
outputs
]
token_ids
=
[
output
.
outputs
[
0
].
token_ids
for
output
in
outputs
]
tokens
=
[
output
.
outputs
[
0
].
text
for
output
in
outputs
]
tokens
=
[
output
.
outputs
[
0
].
text
for
output
in
outputs
]
# Fetch acceptance rate if logging is enabled.
if
stat_loggers
:
=
getattr
(
llm
.
llm_engine
,
"stat_loggers"
,
None
):
stat_logger
=
stat_loggers
[
"prometheus"
]
acceptance_rate
=
(
stat_logger
.
metrics
.
gauge_spec_decode_draft_acceptance_rate
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
())
del
llm
del
llm
return
tokens
,
token_ids
return
tokens
,
token_ids
,
acceptance_rate
def
get_logprobs_from_llm_generator
(
def
get_logprobs_from_llm_generator
(
...
@@ -237,12 +263,37 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
...
@@ -237,12 +263,37 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
batch_size
,
batch_size
,
max_output_len
,
max_output_len
,
force_output_len
:
bool
,
force_output_len
:
bool
,
print_tokens
:
bool
=
False
):
print_tokens
:
bool
=
False
,
ensure_all_accepted
:
bool
=
False
):
"""Helper method that compares the outputs of both the baseline LLM and
"""Helper method that compares the outputs of both the baseline LLM and
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the same when temperature is zero.
the same when temperature is zero.
"""
"""
temperature
=
0.0
run_equality_correctness_test
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
max_output_len
,
force_output_len
,
temperature
=
0.0
,
seeded
=
False
,
print_tokens
=
print_tokens
,
ensure_all_accepted
=
ensure_all_accepted
)
def
run_equality_correctness_test
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
max_output_len
,
force_output_len
:
bool
,
temperature
:
float
,
seeded
:
bool
,
print_tokens
:
bool
=
False
,
ensure_all_accepted
:
bool
=
False
):
"""Helper method that compares the outputs of both the baseline LLM and
the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
the same when temperature is zero (or when temperature is > 0 and seeded).
"""
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -261,18 +312,29 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
...
@@ -261,18 +312,29 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
# sampling params to ignore eos token.
# sampling params to ignore eos token.
ignore_eos
=
force_output_len
ignore_eos
=
force_output_len
sampling_params
=
SamplingParams
(
if
seeded
:
max_tokens
=
max_output_len
,
sampling_params
=
[
ignore_eos
=
ignore_eos
,
SamplingParams
(
temperature
=
temperature
,
max_tokens
=
max_output_len
,
)
ignore_eos
=
ignore_eos
,
temperature
=
temperature
,
seed
=
i
,
)
for
i
in
range
(
len
(
prompts
))
]
else
:
sampling_params
=
SamplingParams
(
max_tokens
=
max_output_len
,
ignore_eos
=
ignore_eos
,
temperature
=
temperature
,
)
spec_batch_tokens
,
spec_batch_token_ids
=
get_output_from_llm_generator
(
(
spec_batch_tokens
,
spec_batch_token_ids
,
test_llm_generator
,
prompts
,
sampling_params
)
acceptance_rate
)
=
get_output_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
(
baseline_batch_tokens
,
(
baseline_batch_tokens
,
baseline_batch_token_ids
,
baseline_batch_token_ids
)
=
get_output_from_llm_generator
(
_
)
=
get_output_from_llm_generator
(
baseline_llm_generator
,
prompts
,
baseline_llm_generator
,
prompts
,
sampling_params
)
sampling_params
)
assert
len
(
baseline_batch_token_ids
)
==
len
(
prompts
)
assert
len
(
baseline_batch_token_ids
)
==
len
(
prompts
)
assert
len
(
spec_batch_token_ids
)
==
len
(
prompts
)
assert
len
(
spec_batch_token_ids
)
==
len
(
prompts
)
...
@@ -287,3 +349,6 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
...
@@ -287,3 +349,6 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
print
(
f
'
{
i
=
}
{
baseline_token_ids
=
}
'
)
print
(
f
'
{
i
=
}
{
baseline_token_ids
=
}
'
)
print
(
f
'
{
i
=
}
{
spec_token_ids
=
}
'
)
print
(
f
'
{
i
=
}
{
spec_token_ids
=
}
'
)
assert
baseline_token_ids
==
spec_token_ids
assert
baseline_token_ids
==
spec_token_ids
if
ensure_all_accepted
:
assert
acceptance_rate
==
1.0
tests/spec_decode/e2e/test_integration_dist_tp2.py
View file @
500b93c8
...
@@ -83,6 +83,9 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
...
@@ -83,6 +83,9 @@ def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
# cleaned up properly, and its server host thread leaks, causing the
# cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error.
# second run of the test to fail with internal NCCL error.
"use_async"
:
True
,
"use_async"
:
True
,
# precision
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
...
tests/spec_decode/e2e/test_integration_dist_tp4.py
View file @
500b93c8
...
@@ -58,3 +58,65 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
...
@@ -58,3 +58,65 @@ def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
batch_size
,
batch_size
,
max_output_len
=
32
,
max_output_len
=
32
,
force_output_len
=
True
)
force_output_len
=
True
)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
                    reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": "JackFram/llama-160m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,
        "tensor_parallel_size": 4,

        # Use AsyncLLM engine, so that the engine runs in its own process.
        # Otherwise, since vLLM does not follow true SPMD, the test runner
        # process will have both the engine and the rank0 worker. NCCL is not
        # cleaned up properly, and its server host thread leaks, causing the
        # second run of the test to fail with internal NCCL error.
        "use_async": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {
            "speculative_model": "JackFram/llama-68m",
            "num_speculative_tokens": 5,

            # Artificially limit the draft model max model len; this forces
            # vLLM to skip speculation once the sequences grow beyond 32-k
            # tokens.
            "speculative_max_model_len": 32,
        },
    ])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize(
    "output_len",
    [
        # This must be a good bit larger than speculative_max_model_len so
        # that we can test the case where all seqs are skipped, but still
        # small to ensure fast test.
        64,
    ])
@pytest.mark.parametrize("seed", [1])
def test_skip_speculation(baseline_llm_generator, test_llm_generator,
                          batch_size: int, output_len: int):
    """Expect a RuntimeError once every sequence skips speculation.

    The draft model's max model len is set artificially low, so sequences
    that grow past it are skipped in speculative decoding. The greedy
    equality helper is expected to raise RuntimeError in that situation.

    TODO: fix it to pass without raising Error. (#5814)
    """
    expected_failure = pytest.raises(RuntimeError)
    with expected_failure:
        run_greedy_equality_correctness_test(baseline_llm_generator,
                                             test_llm_generator,
                                             batch_size,
                                             max_output_len=output_len,
                                             force_output_len=True)
tests/spec_decode/e2e/test_logprobs.py
View file @
500b93c8
...
@@ -22,10 +22,12 @@ from .conftest import get_logprobs_from_llm_generator
...
@@ -22,10 +22,12 @@ from .conftest import get_logprobs_from_llm_generator
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
"speculative_model"
:
"JackFram/llama-160m"
,
[{
"num_speculative_tokens"
:
3
,
"speculative_model"
:
"JackFram/llama-160m"
,
}])
"num_speculative_tokens"
:
3
,
"disable_logprobs_during_spec_decoding"
:
False
,
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"output_len"
,
"output_len"
,
...
@@ -59,10 +61,12 @@ def test_logprobs_equality(baseline_llm_generator, test_llm_generator,
...
@@ -59,10 +61,12 @@ def test_logprobs_equality(baseline_llm_generator, test_llm_generator,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
"speculative_model"
:
"JackFram/llama-160m"
,
[{
"num_speculative_tokens"
:
3
,
"speculative_model"
:
"JackFram/llama-160m"
,
}])
"num_speculative_tokens"
:
3
,
"disable_logprobs_during_spec_decoding"
:
False
,
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
6
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
6
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -99,13 +103,16 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
...
@@ -99,13 +103,16 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
"speculative_model"
:
"JackFram/llama-160m"
,
[{
"num_speculative_tokens"
:
3
,
"speculative_model"
:
"JackFram/llama-160m"
,
},
{
"num_speculative_tokens"
:
3
,
"speculative_model"
:
"JackFram/llama-160m"
,
"disable_logprobs_during_spec_decoding"
:
False
,
"num_speculative_tokens"
:
6
,
},
{
}])
"speculative_model"
:
"JackFram/llama-160m"
,
"num_speculative_tokens"
:
6
,
"disable_logprobs_during_spec_decoding"
:
False
,
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"output_len"
,
"output_len"
,
...
@@ -143,6 +150,7 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
...
@@ -143,6 +150,7 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator,
[{
[{
"speculative_model"
:
"JackFram/llama-160m"
,
"speculative_model"
:
"JackFram/llama-160m"
,
"num_speculative_tokens"
:
3
,
"num_speculative_tokens"
:
3
,
"disable_logprobs_during_spec_decoding"
:
False
,
# Artificially limit the draft model max model len; this forces vLLM
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
# to skip speculation once the sequences grow beyond 32-k tokens.
...
@@ -181,10 +189,12 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator,
...
@@ -181,10 +189,12 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
"speculative_model"
:
"JackFram/llama-160m"
,
[{
"num_speculative_tokens"
:
3
,
"speculative_model"
:
"JackFram/llama-160m"
,
}])
"num_speculative_tokens"
:
3
,
"disable_logprobs_during_spec_decoding"
:
False
,
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"output_len"
,
"output_len"
,
...
...
tests/spec_decode/e2e/test_multistep_correctness.py
View file @
500b93c8
...
@@ -97,7 +97,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
...
@@ -97,7 +97,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
temperature
=
temperature
,
temperature
=
temperature
,
)
)
batch_tokens
,
batch_token_ids
=
get_output_from_llm_generator
(
batch_tokens
,
batch_token_ids
,
_
=
get_output_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
test_llm_generator
,
prompts
,
sampling_params
)
# Expect a generation for each prompt in the batch.
# Expect a generation for each prompt in the batch.
...
@@ -200,12 +200,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
...
@@ -200,12 +200,18 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
Since this test is cheaper than other e2e correctness tests, we generate
Since this test is cheaper than other e2e correctness tests, we generate
with a higher output_len.
with a higher output_len.
When the draft model is the same as the target model, we further check
whether all speculative tokens are accepted.
"""
"""
run_greedy_equality_correctness_test
(
baseline_llm_generator
,
ensure_all_accepted
=
test_llm_generator
.
same_draft_target_model
test_llm_generator
,
run_greedy_equality_correctness_test
(
batch_size
,
baseline_llm_generator
,
max_output_len
=
output_len
,
test_llm_generator
,
force_output_len
=
True
)
batch_size
,
max_output_len
=
output_len
,
force_output_len
=
True
,
ensure_all_accepted
=
ensure_all_accepted
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment