Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f48954a4
Commit
f48954a4
authored
Jun 12, 2024
by
zhuwenwen
Browse files
merge v0.5.0
parents
1dba29d3
8f89d720
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
735 additions
and
266 deletions
+735
-266
tests/lora/test_long_context.py
tests/lora/test_long_context.py
+13
-10
tests/lora/test_utils.py
tests/lora/test_utils.py
+13
-1
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+42
-42
tests/models/test_aqlm.py
tests/models/test_aqlm.py
+10
-8
tests/models/test_big_models.py
tests/models/test_big_models.py
+17
-14
tests/models/test_embedding.py
tests/models/test_embedding.py
+4
-6
tests/models/test_fp8.py
tests/models/test_fp8.py
+7
-4
tests/models/test_gptq_marlin.py
tests/models/test_gptq_marlin.py
+24
-24
tests/models/test_gptq_marlin_24.py
tests/models/test_gptq_marlin_24.py
+16
-17
tests/models/test_llava.py
tests/models/test_llava.py
+74
-71
tests/models/test_llava_next.py
tests/models/test_llava_next.py
+123
-0
tests/models/test_marlin.py
tests/models/test_marlin.py
+17
-18
tests/models/test_mistral.py
tests/models/test_mistral.py
+6
-9
tests/models/test_models.py
tests/models/test_models.py
+9
-12
tests/multimodal/__init__.py
tests/multimodal/__init__.py
+0
-0
tests/multimodal/test_processor.py
tests/multimodal/test_processor.py
+149
-0
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+75
-0
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+80
-0
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+51
-25
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+5
-5
No files found.
tests/lora/test_long_context.py
View file @
f48954a4
...
...
@@ -102,22 +102,21 @@ def batched_generate(
return
[
outputs
[
i
].
outputs
[
0
].
text
.
strip
()
for
i
in
range
(
len
(
outputs
))]
@
pytest
.
fixture
@
pytest
.
fixture
(
scope
=
"module"
)
def
lora_llm
(
long_context_infos
):
scaling_factors
=
[
context_len_to_scaling_factor
[
info
[
"context_length"
]]
for
info
in
long_context_infos
.
values
()
]
llm
=
vllm
.
LLM
(
"meta-llama/Llama-2-13b-chat-hf"
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
2
,
long_lora_scaling_factors
=
tuple
(
scaling_factors
),
max_num_batched_tokens
=
4096
*
8
,
tensor_parallel_size
=
4
,
)
llm
=
vllm
.
LLM
(
"meta-llama/Llama-2-13b-chat-hf"
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
2
,
long_lora_scaling_factors
=
tuple
(
scaling_factors
),
max_num_batched_tokens
=
4096
*
8
,
tensor_parallel_size
=
4
,
distributed_executor_backend
=
"mp"
)
yield
llm
del
llm
...
...
@@ -154,6 +153,7 @@ def test_rotary_emb_replaced(dist_init):
assert
rotary_emb_count
==
32
@
pytest
.
mark
.
skip_global_cleanup
def
test_batched_rope_kernel
(
lora_llm
,
long_context_infos
):
"""We test the batched kernel by comparing the results of batched an
non-batched generation.
...
...
@@ -188,6 +188,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
f
"same:
\n
{
batched
}
\n
{
non_batched
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_self_consistency
(
lora_llm
,
long_context_infos
):
"""We test consistency of the batched kernel by permuting batched
inputs and comparing the results to the non-permuted batched results.
...
...
@@ -227,6 +228,7 @@ def test_self_consistency(lora_llm, long_context_infos):
f
"
\n
{
permutated_batched_results
[
permutation
[
i
]]
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_quality
(
lora_llm
,
long_context_infos
):
"""We test the quality of the answers given by the LoRA model by
comparing the generated text to the merged model's outputs.
...
...
@@ -257,6 +259,7 @@ def test_quality(lora_llm, long_context_infos):
assert
np
.
mean
(
scores
)
>
0.5
@
pytest
.
mark
.
skip_global_cleanup
def
test_max_len
(
lora_llm
,
long_context_infos
):
"""Test that we raise an ValueError when the input of a given LoRA
model exceeds the maximum length."""
...
...
tests/lora/test_utils.py
View file @
f48954a4
from
collections
import
OrderedDict
import
pytest
from
torch
import
nn
from
vllm.lora.utils
import
parse_fine_tuned_lora_name
,
replace_submodule
from
vllm.utils
import
LRUCache
def
test_parse_fine_tuned_lora_name
():
def
test_parse_fine_tuned_lora_name
_valid
():
fixture
=
{
(
"base_model.model.lm_head.lora_A.weight"
,
"lm_head"
,
True
),
(
"base_model.model.lm_head.lora_B.weight"
,
"lm_head"
,
False
),
...
...
@@ -35,6 +36,17 @@ def test_parse_fine_tuned_lora_name():
assert
(
module_name
,
is_lora_a
)
==
parse_fine_tuned_lora_name
(
name
)
def
test_parse_fine_tuned_lora_name_invalid
():
fixture
=
{
"weight"
,
"base_model.weight"
,
"base_model.model.weight"
,
}
for
name
in
fixture
:
with
pytest
.
raises
(
ValueError
,
match
=
"unsupported LoRA weight"
):
parse_fine_tuned_lora_name
(
name
)
def
test_replace_submodule
():
model
=
nn
.
Sequential
(
OrderedDict
([
...
...
tests/metrics/test_metrics.py
View file @
f48954a4
...
...
@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens(
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
)
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
prompt_token_counts
=
[
len
(
tokenizer
.
encode
(
p
))
for
p
in
example_prompts
]
# This test needs at least 2 prompts in a batch of different lengths to
# verify their token count is correct despite padding.
assert
len
(
example_prompts
)
>
1
,
"at least 2 prompts are required"
assert
prompt_token_counts
[
0
]
!=
prompt_token_counts
[
1
],
(
"prompts of different lengths are required"
)
vllm_prompt_token_count
=
sum
(
prompt_token_counts
)
_
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metric_count
=
stat_logger
.
metrics
.
counter_prompt_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
with
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
prompt_token_counts
=
[
len
(
tokenizer
.
encode
(
p
))
for
p
in
example_prompts
]
# This test needs at least 2 prompts in a batch of different lengths to
# verify their token count is correct despite padding.
assert
len
(
example_prompts
)
>
1
,
"at least 2 prompts are required"
assert
prompt_token_counts
[
0
]
!=
prompt_token_counts
[
1
],
(
"prompts of different lengths are required"
)
vllm_prompt_token_count
=
sum
(
prompt_token_counts
)
_
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metric_count
=
stat_logger
.
metrics
.
counter_prompt_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
assert
vllm_prompt_token_count
==
metric_count
,
(
f
"prompt token count:
{
vllm_prompt_token_count
!
r
}
\n
"
...
...
@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens(
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metric_count
=
stat_logger
.
metrics
.
counter_generation_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
vllm_generation_count
=
0
for
i
in
range
(
len
(
example_prompts
)):
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
prompt_ids
=
tokenizer
.
encode
(
example_prompts
[
i
])
# vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens.
vllm_generation_count
+=
len
(
vllm_output_ids
)
-
len
(
prompt_ids
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metric_count
=
stat_logger
.
metrics
.
counter_generation_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
vllm_generation_count
=
0
for
i
in
range
(
len
(
example_prompts
)):
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
prompt_ids
=
tokenizer
.
encode
(
example_prompts
[
i
])
# vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens.
vllm_generation_count
+=
len
(
vllm_output_ids
)
-
len
(
prompt_ids
)
assert
vllm_generation_count
==
metric_count
,
(
f
"generation token count:
{
vllm_generation_count
!
r
}
\n
"
...
...
@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens(
[
None
,
[],
[
"ModelName0"
],
[
"ModelName0"
,
"ModelName1"
,
"ModelName2"
]])
def
test_metric_set_tag_model_name
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
served_model_name
:
List
[
str
])
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.3
,
served_model_name
=
served_model_name
)
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metrics_tag_content
=
stat_logger
.
labels
[
"model_name"
]
del
vllm_model
with
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.3
,
served_model_name
=
served_model_name
)
as
vllm_model
:
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metrics_tag_content
=
stat_logger
.
labels
[
"model_name"
]
if
served_model_name
is
None
or
served_model_name
==
[]:
assert
metrics_tag_content
==
model
,
(
...
...
tests/models/test_aqlm.py
View file @
f48954a4
...
...
@@ -8,10 +8,13 @@ import torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
aqlm_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"aqlm"
].
get_min_capability
())
aqlm_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
aqlm_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"aqlm"
].
get_min_capability
())
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
...
...
@@ -79,10 +82,9 @@ def test_models(
num_logprobs
:
int
,
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
# loop through the prompts to compare against the ground truth generations
for
prompt_idx
in
range
(
len
(
example_prompts
)):
...
...
tests/models/test_big_models.py
View file @
f48954a4
...
...
@@ -5,6 +5,7 @@ This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
"""
import
pytest
import
torch
MODELS
=
[
"meta-llama/Llama-2-7b-hf"
,
...
...
@@ -16,9 +17,14 @@ MODELS = [
# "Qwen/Qwen1.5-0.5B" # Broken,
]
#TODO: remove this after CPU float16 support ready
target_dtype
=
"float"
if
torch
.
cuda
.
is_available
():
target_dtype
=
"half"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
def
test_models
(
hf_runner
,
...
...
@@ -28,13 +34,11 @@ def test_models(
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
@@ -46,15 +50,14 @@ def test_models(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
def
test_model_print
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
del
vllm_model
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
tests/models/test_embedding.py
View file @
f48954a4
...
...
@@ -28,13 +28,11 @@ def test_models(
model
:
str
,
dtype
:
str
,
)
->
None
:
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
del
hf_model
with
hf_runner
(
model
,
dtype
=
dtype
,
is_embedding_model
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
del
vllm_model
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
similarities
=
compare_embeddings
(
hf_outputs
,
vllm_outputs
)
all_similarities
=
torch
.
stack
(
similarities
)
...
...
tests/models/test_fp8.py
View file @
f48954a4
...
...
@@ -67,10 +67,13 @@ EXPECTED_STRS_MAP = {
},
}
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
fp8_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
())
fp8_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
fp8_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
())
@
pytest
.
mark
.
skipif
(
fp8_not_supported
,
...
...
tests/models/test_gptq_marlin.py
View file @
f48954a4
...
...
@@ -22,10 +22,13 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN
=
1024
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
gptq_marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"gptq_marlin"
].
get_min_capability
())
gptq_marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
gptq_marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"gptq_marlin"
].
get_min_capability
())
MODELS
=
[
# act_order==False, group_size=channelwise
...
...
@@ -67,32 +70,29 @@ def test_models(
model_name
,
revision
=
model
# Run marlin.
gptq_marlin_model
=
vllm_runner
(
model_name
=
model_name
,
revision
=
revision
,
dtype
=
dtype
,
quantization
=
"marlin"
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
1
)
gptq_marlin_outputs
=
gptq_marlin_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
del
gptq_marlin_model
with
vllm_runner
(
model_name
=
model_name
,
revision
=
revision
,
dtype
=
dtype
,
quantization
=
"marlin"
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
1
)
as
gptq_marlin_model
:
gptq_marlin_outputs
=
gptq_marlin_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
_ROPE_DICT
.
clear
()
# clear rope cache to avoid rope dtype error
# Run gptq.
# The naive gptq kernel doesn't support bf16 yet.
# Here we always compare fp16/bf16 gpt marlin kernel
# to fp16 gptq kernel.
gptq_model
=
vllm_runner
(
model_name
=
model_name
,
revision
=
revision
,
dtype
=
"half"
,
quantization
=
"gptq"
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
1
)
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
del
gptq_model
with
vllm_runner
(
model_name
=
model_name
,
revision
=
revision
,
dtype
=
"half"
,
quantization
=
"gptq"
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
1
)
as
gptq_model
:
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
gptq_outputs
,
...
...
tests/models/test_gptq_marlin_24.py
View file @
f48954a4
...
...
@@ -14,10 +14,13 @@ import torch
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
...
...
@@ -58,20 +61,16 @@ def test_models(
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
marlin_24_model
=
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
quantization
=
"gptq_marlin_24"
)
marlin_24_outputs
=
marlin_24_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
del
marlin_24_model
with
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
quantization
=
"gptq_marlin_24"
)
as
marlin_24_model
:
marlin_24_outputs
=
marlin_24_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
,
quantization
=
"gptq"
)
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
del
gptq_model
with
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
,
quantization
=
"gptq"
)
as
gptq_model
:
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
gptq_outputs
,
...
...
tests/models/test_llava.py
View file @
f48954a4
import
gc
from
dataclasses
import
fields
from
enum
import
Enum
from
typing
import
Dict
,
List
,
Tuple
from
typing
import
List
,
Tuple
import
pytest
import
torch
from
transformers
import
AutoTokenizer
from
vllm.config
import
VisionLanguageConfig
model_and_vl_config
=
[
(
"llava-hf/llava-1.5-7b-hf"
,
VisionLanguageConfig
(
image_input_type
=
VisionLanguageConfig
.
ImageInputType
.
PIXEL_VALUES
,
image_feature_size
=
576
,
image_token_id
=
32000
,
image_input_shape
=
(
1
,
3
,
336
,
336
))),
(
"llava-hf/llava-1.5-7b-hf"
,
VisionLanguageConfig
(
image_input_type
=
VisionLanguageConfig
.
ImageInputType
.
IMAGE_FEATURES
,
image_feature_size
=
576
,
image_token_id
=
32000
,
image_input_shape
=
(
1
,
576
,
1024
)))
from
..conftest
import
IMAGE_FILES
pytestmark
=
pytest
.
mark
.
llava
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
[
"<image>
\n
USER: What's the content of the image?
\n
ASSISTANT:"
,
"<image>
\n
USER: What is the season?
\n
ASSISTANT:"
,
]
assert
len
(
HF_IMAGE_PROMPTS
)
==
len
(
IMAGE_FILES
)
def
as_dict
(
vision_language_config
:
VisionLanguageConfig
)
->
Dict
:
"""Flatten vision language config to pure args.
Compatible with what llm entrypoint expects.
"""
result
=
{}
for
field
in
fields
(
vision_language_config
):
value
=
getattr
(
vision_language_config
,
field
.
name
)
if
isinstance
(
value
,
Enum
):
result
[
field
.
name
]
=
value
.
name
.
lower
()
elif
isinstance
(
value
,
tuple
):
result
[
field
.
name
]
=
","
.
join
([
str
(
item
)
for
item
in
value
])
else
:
result
[
field
.
name
]
=
value
return
result
def
sanitize_vllm_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
],
vision_language_config
:
VisionLanguageConfig
,
model_id
:
str
):
def
iter_llava_configs
(
model_name
:
str
):
image_hw_to_feature_size
=
{
(
336
,
336
):
576
,
}
for
(
h
,
w
),
f
in
image_hw_to_feature_size
.
items
():
for
input_type
,
input_shape
in
[
(
VisionLanguageConfig
.
ImageInputType
.
PIXEL_VALUES
,
(
1
,
3
,
h
,
w
)),
(
VisionLanguageConfig
.
ImageInputType
.
IMAGE_FEATURES
,
(
1
,
f
,
1024
)),
]:
yield
(
model_name
,
VisionLanguageConfig
(
image_input_type
=
input_type
,
image_feature_size
=
f
,
image_token_id
=
32000
,
image_input_shape
=
input_shape
,
image_processor
=
model_name
,
image_processor_revision
=
None
))
model_and_vl_config
=
[
*
iter_llava_configs
(
"llava-hf/llava-1.5-7b-hf"
),
]
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
],
vlm_config
:
VisionLanguageConfig
,
model_id
:
str
):
"""Sanitize vllm output to be comparable with hf output.
The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_token_str
=
tokenizer
.
decode
(
vision_language_config
.
image_token_id
)
image_token_str_len
=
len
(
image_token_str
)
input_ids
,
output_str
=
vllm_output
sanitized_input_ids
=
input_ids
[
0
:
2
]
+
input_ids
[
2
+
vision_language_config
.
image_feature_size
-
1
:]
sanitzied_output_str
=
output_str
[
vision_language_config
.
image_feature_size
*
image_token_str_len
:]
return
sanitized_input_ids
,
sanitzied_output_str
image_token_id
=
vlm_config
.
image_token_id
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_token_str
=
tokenizer
.
decode
(
image_token_id
)
hf_input_ids
=
[
input_id
for
idx
,
input_id
in
enumerate
(
input_ids
)
if
input_id
!=
image_token_id
or
input_ids
[
idx
-
1
]
!=
image_token_id
]
hf_output_str
=
output_str
\
.
replace
(
image_token_str
*
vlm_config
.
image_feature_size
,
""
)
@
pytest
.
mark
.
parametrize
(
"worker_use_ray"
,
[
False
])
return
hf_input_ids
,
hf_output_str
# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
@
pytest
.
mark
.
parametrize
(
"model_and_config"
,
model_and_vl_config
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
def
test_models
(
hf_runner
,
vllm_runner
,
hf_image_prompts
,
hf_images
,
vllm_image_prompts
,
vllm_images
,
model_and_config
:
tuple
,
dtype
:
str
,
max_tokens
:
int
,
worker_use_ray
:
bool
)
->
None
:
def
test_models
(
hf_runner
,
vllm_runner
,
hf_images
,
vllm_images
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the
raw
images as input.
For vllm runner, we provide
image tensor
s and corresponding
For huggingface runner, we provide the
PIL
images as input.
For vllm runner, we provide
MultiModalData object
s and corresponding
vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
model_id
,
vision_language_config
=
model_and_config
hf_model
=
hf_runner
(
model_id
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
hf_image_prompts
,
max_tokens
,
images
=
hf_images
)
del
hf_model
vllm_model
=
vllm_runner
(
model_id
,
dtype
=
dtype
,
worker_use_ray
=
worker_use_ray
,
**
as_dict
(
vision_language_config
))
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_image_prompts
,
model_id
,
vlm_config
=
model_and_config
with
hf_runner
(
model_id
,
dtype
=
dtype
,
is_vision_model
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
HF_IMAGE_PROMPTS
,
max_tokens
,
images
=
vllm_images
)
del
vllm_model
images
=
hf_images
)
vllm_image_prompts
=
[
p
.
replace
(
"<image>"
,
"<image>"
*
vlm_config
.
image_feature_size
)
for
p
in
HF_IMAGE_PROMPTS
]
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
with
vllm_runner
(
model_id
,
dtype
=
dtype
,
enforce_eager
=
True
,
**
vlm_config
.
as_cli_args_dict
())
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_image_prompts
,
max_tokens
,
images
=
vllm_images
)
for
i
in
range
(
len
(
hf_image_prompts
)):
for
i
in
range
(
len
(
HF_IMAGE_PROMPTS
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
sanitize_vllm
_output
(
vllm_outputs
[
i
],
v
ision_language
_config
,
model_id
)
vllm_output_ids
,
vllm_output_str
=
vllm_to_hf
_output
(
vllm_outputs
[
i
],
v
lm
_config
,
model_id
)
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
...
...
tests/models/test_llava_next.py
0 → 100644
View file @
f48954a4
from
typing
import
List
,
Tuple
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.config
import
VisionLanguageConfig
from
..conftest
import
IMAGE_FILES
pytestmark
=
pytest
.
mark
.
llava
_PREFACE
=
(
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's "
"questions."
)
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
[
f
"
{
_PREFACE
}
<image>
\n
USER: What's the content of the image? ASSISTANT:"
,
f
"
{
_PREFACE
}
<image>
\n
USER: What is the season? ASSISTANT:"
,
]
assert
len
(
HF_IMAGE_PROMPTS
)
==
len
(
IMAGE_FILES
)
def
iter_llava_next_configs
(
model_name
:
str
):
image_hw_to_feature_size
=
{
(
336
,
336
):
1176
,
(
672
,
672
):
2928
,
(
1344
,
336
):
1944
,
(
336
,
1344
):
1890
,
}
for
(
h
,
w
),
f
in
image_hw_to_feature_size
.
items
():
for
input_type
,
input_shape
in
[
(
VisionLanguageConfig
.
ImageInputType
.
PIXEL_VALUES
,
(
1
,
3
,
h
,
w
)),
]:
yield
(
model_name
,
VisionLanguageConfig
(
image_input_type
=
input_type
,
image_feature_size
=
f
,
image_token_id
=
32000
,
image_input_shape
=
input_shape
,
image_processor
=
model_name
,
image_processor_revision
=
None
))
model_and_vl_config
=
[
*
iter_llava_next_configs
(
"llava-hf/llava-v1.6-vicuna-7b-hf"
),
]
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
],
vlm_config
:
VisionLanguageConfig
,
model_id
:
str
):
"""Sanitize vllm output to be comparable with hf output.
The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
input_ids
,
output_str
=
vllm_output
image_token_id
=
vlm_config
.
image_token_id
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_token_str
=
tokenizer
.
decode
(
image_token_id
)
hf_input_ids
=
[
input_id
for
idx
,
input_id
in
enumerate
(
input_ids
)
if
input_id
!=
image_token_id
or
input_ids
[
idx
-
1
]
!=
image_token_id
]
hf_output_str
=
output_str
\
.
replace
(
image_token_str
*
vlm_config
.
image_feature_size
,
" "
)
return
hf_input_ids
,
hf_output_str
@
pytest
.
mark
.
xfail
(
reason
=
"Inconsistent image processor being used due to lack "
"of support for dynamic image token replacement"
)
@
pytest
.
mark
.
parametrize
(
"model_and_config"
,
model_and_vl_config
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
def
test_models
(
hf_runner
,
vllm_runner
,
hf_images
,
vllm_images
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalData objects and corresponding
vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
model_id
,
vlm_config
=
model_and_config
with
hf_runner
(
model_id
,
dtype
=
dtype
,
is_vision_model
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
HF_IMAGE_PROMPTS
,
max_tokens
,
images
=
hf_images
)
vllm_image_prompts
=
[
p
.
replace
(
"<image>"
,
"<image>"
*
vlm_config
.
image_feature_size
)
for
p
in
HF_IMAGE_PROMPTS
]
with
vllm_runner
(
model_id
,
dtype
=
dtype
,
# should be greater than image_feature_size
max_model_len
=
4096
,
enforce_eager
=
True
,
**
vlm_config
.
as_cli_args_dict
(),
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_image_prompts
,
max_tokens
,
images
=
vllm_images
)
for
i
in
range
(
len
(
HF_IMAGE_PROMPTS
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_to_hf_output
(
vllm_outputs
[
i
],
vlm_config
,
model_id
)
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
tests/models/test_marlin.py
View file @
f48954a4
...
...
@@ -19,10 +19,13 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from
.utils
import
check_logprobs_close
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
...
...
@@ -56,20 +59,16 @@ def test_models(
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
marlin_model
=
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
quantization
=
"marlin"
)
marlin_outputs
=
marlin_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
del
marlin_model
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
,
quantization
=
"gptq"
)
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
del
gptq_model
with
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
quantization
=
"marlin"
)
as
marlin_model
:
marlin_outputs
=
marlin_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
,
quantization
=
"gptq"
)
as
gptq_model
:
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
gptq_outputs
,
...
...
tests/models/test_mistral.py
View file @
f48954a4
...
...
@@ -26,16 +26,13 @@ def test_models(
num_logprobs
:
int
,
)
->
None
:
# TODO(sang): Sliding window should be tested separately.
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
del
hf_model
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
del
vllm_model
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
...
...
tests/models/test_models.py
View file @
f48954a4
...
...
@@ -34,13 +34,11 @@ def test_models(
# To pass the small model tests, we need full precision.
assert
dtype
==
"float"
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
@@ -58,9 +56,8 @@ def test_model_print(
model
:
str
,
dtype
:
str
,
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
del
vllm_model
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
tests/multimodal/__init__.py
0 → 100644
View file @
f48954a4
tests/multimodal/test_processor.py
0 → 100644
View file @
f48954a4
import
numpy
as
np
import
pytest
from
transformers
import
CLIPImageProcessor
,
LlavaNextImageProcessor
from
vllm.config
import
ModelConfig
,
VisionLanguageConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
ImagePixelData
from
..conftest
import
_STR_DTYPE_TO_TORCH_DTYPE
@pytest.mark.parametrize("dtype", ["half", "float"])
def test_clip_image_processor(hf_images, dtype):
    """Compare vLLM's image preprocessing with HuggingFace's CLIP processor.

    Each image in ``hf_images`` is run through ``CLIPImageProcessor`` (cast
    to the torch dtype mapped from ``dtype``) and through
    ``MULTIMODAL_REGISTRY.process_input``. Both results must expose the same
    keys, and for every key the arrays must have equal shapes and be
    element-wise close.
    """
    model_name = "llava-hf/llava-1.5-7b-hf"
    image_side = 560  # used for both the height and the width

    reference_processor = CLIPImageProcessor.from_pretrained(model_name)
    assert isinstance(reference_processor, CLIPImageProcessor)

    model_config = ModelConfig(
        model=model_name,
        tokenizer=model_name,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=32000,
        image_input_shape=(1, 3, image_side, image_side),
        image_feature_size=576,
        image_processor=model_name,
        image_processor_revision=None,
    )

    for image in hf_images:
        # Reference path: HuggingFace preprocessing, then cast to the dtype
        # under test.
        expected = reference_processor.preprocess(image, return_tensors="pt")
        expected = expected.to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])

        # Path under test: vLLM's multimodal registry.
        actual = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert expected.keys() == actual.keys()
        for key, expected_tensor in expected.items():
            failure = f"Failed for key={key}"
            expected_arr: np.ndarray = expected_tensor.numpy()
            actual_arr: np.ndarray = actual[key].numpy()

            assert expected_arr.shape == actual_arr.shape, failure
            assert np.allclose(expected_arr, actual_arr), failure
@pytest.mark.xfail(
    reason="Inconsistent image processor being used due to lack "
    "of support for dynamic image token replacement")
@pytest.mark.parametrize("dtype", ["half", "float"])
def test_llava_next_image_processor(hf_images, dtype):
    """Compare vLLM's preprocessing with HuggingFace's LLaVA-NeXT processor.

    Marked ``xfail`` (see the decorator's reason). Each image in
    ``hf_images`` is run through ``LlavaNextImageProcessor`` (cast to the
    torch dtype mapped from ``dtype``) and through
    ``MULTIMODAL_REGISTRY.process_input``; the two results must share keys,
    shapes and (approximately) values.
    """
    model_name = "llava-hf/llava-v1.6-34b-hf"
    image_side = 560  # used for both the height and the width

    reference_processor = LlavaNextImageProcessor.from_pretrained(model_name)
    assert isinstance(reference_processor, LlavaNextImageProcessor)

    model_config = ModelConfig(
        model=model_name,
        tokenizer=model_name,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=64000,
        image_input_shape=(1, 3, image_side, image_side),
        image_feature_size=2928,
        image_processor=model_name,
        image_processor_revision=None,
    )

    for image in hf_images:
        # Reference path: HuggingFace preprocessing, then cast to the dtype
        # under test.
        expected = reference_processor.preprocess(image, return_tensors="pt")
        expected = expected.to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])

        # Path under test: vLLM's multimodal registry.
        actual = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert expected.keys() == actual.keys()
        for key, expected_tensor in expected.items():
            failure = f"Failed for key={key}"
            expected_arr: np.ndarray = expected_tensor.numpy()
            actual_arr: np.ndarray = actual[key].numpy()

            assert expected_arr.shape == actual_arr.shape, failure
            assert np.allclose(expected_arr, actual_arr), failure
@pytest.mark.xfail(
    reason="Example image pixels were not processed using HuggingFace")
@pytest.mark.parametrize("dtype", ["float"])
def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
    """Check that image and tensor inputs are processed to the same result.

    Marked ``xfail`` (see the decorator's reason). Pairs each entry of
    ``hf_images`` with the matching entry of ``vllm_image_tensors``, wraps
    both in ``ImagePixelData``, sends them through
    ``MULTIMODAL_REGISTRY.process_input`` and requires identical keys,
    equal shapes and element-wise close values.
    """
    model_name = "llava-hf/llava-1.5-7b-hf"
    image_side = 560  # used for both the height and the width

    model_config = ModelConfig(
        model=model_name,
        tokenizer=model_name,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=32000,
        image_input_shape=(1, 3, image_side, image_side),
        image_feature_size=576,
        image_processor=model_name,
        image_processor_revision=None,
    )

    for image, tensor in zip(hf_images, vllm_image_tensors):
        from_image = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )
        from_tensor = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(tensor),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert from_image.keys() == from_tensor.keys()
        for key, image_value in from_image.items():
            failure = f"Failed for key={key}"
            # Only the tensor-side value is converted with ``.numpy()``; the
            # image-side value is compared as returned by the registry.
            tensor_value: np.ndarray = from_tensor[key].numpy()

            assert image_value.shape == tensor_value.shape, failure
            assert np.allclose(image_value, tensor_value), failure
tests/multimodal/test_utils.py
0 → 100644
View file @
f48954a4
import
base64
import
mimetypes
from
tempfile
import
NamedTemporaryFile
from
typing
import
Dict
,
Tuple
import
numpy
as
np
import
pytest
import
pytest_asyncio
from
PIL
import
Image
from
vllm.multimodal.utils
import
ImageFetchAiohttp
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS
=
[
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"
,
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png"
,
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
,
]
@pytest_asyncio.fixture(scope="session")
async def url_images() -> Dict[str, Image.Image]:
    """Session-scoped fixture mapping each test URL to its fetched image.

    Every URL in ``TEST_IMAGE_URLS`` is fetched in turn with
    ``ImageFetchAiohttp.fetch_image``; the result is keyed by the URL.
    """
    fetched: Dict[str, Image.Image] = {}
    for image_url in TEST_IMAGE_URLS:
        fetched[image_url] = await ImageFetchAiohttp.fetch_image(image_url)
    return fetched
def get_supported_suffixes() -> Tuple[str, ...]:
    """Return the image file suffixes the tests are parametrized over.

    The tuple lists the suffixes mentioned for GPT-4 with Vision first,
    followed by the additional ones supported by this project.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    openai_suffixes = ('.png', '.jpeg', '.jpg', '.webp', '.gif')

    # Additional file types that are supported by us
    extra_suffixes = ('.bmp', '.tiff')

    return (*openai_suffixes, *extra_suffixes)
def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return whether two images hold identical pixel data.

    ``b`` is first converted to the mode of ``a`` so that both are compared
    in the same pixel representation.
    """
    pixels_a = np.asarray(a)
    pixels_b = np.asarray(b.convert(a.mode))
    return (pixels_a == pixels_b).all()
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
                                  image_url: str, suffix: str):
    """Round-trip an image through a base64 ``data:`` URL.

    The image fetched from ``image_url`` is saved to a temporary file with
    the given ``suffix``, re-encoded as a base64 data URL and fetched again
    via ``ImageFetchAiohttp.fetch_image``. The fetch itself must succeed;
    the pixels are additionally compared when re-opening the saved file
    gives back the original pixels (i.e. the save was lossless).

    The test is skipped when no MIME type is known for ``suffix`` or when
    the image cannot be written in the target format (RGBA as JPEG).
    """
    url_image = url_images[image_url]

    # Prefer Pillow's own extension -> format -> MIME mapping and fall back
    # to the stdlib ``mimetypes`` table.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip('No MIME type')

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # Guard against exceptions raised without arguments: indexing
            # ``e.args[0]`` unconditionally would raise IndexError here and
            # hide the real failure.
            if e.args and e.args[0] == 'cannot write mode RGBA as JPEG':
                pytest.skip('Conversion not supported')

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image = await ImageFetchAiohttp.fetch_image(data_url)

        # Lossy format: only check that the image can be opened (done by the
        # fetch above). Lossless format: the pixels must match as well.
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image)
tests/quantization/test_bitsandbytes.py
0 → 100644
View file @
f48954a4
'''Tests whether bitsandbytes computation is enabled correctly.
Run `pytest tests/quantization/test_bitsandbytes.py`.
'''
import
pytest
import
torch
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
@pytest.mark.skipif(
    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
    reason='bitsandbytes is not supported on this GPU type.')
def test_load_bnb_model(vllm_runner) -> None:
    """Load a model with bitsandbytes quantization and sanity-check it.

    Verifies that the MLP and self-attention projection weights of the first
    decoder layer are stored as ``torch.uint8``, that the lm_head, embedding
    and layer-norm weights are not, and that greedy generation reproduces
    the first line of each expected completion.
    """
    with vllm_runner('huggyllama/llama-7b',
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     enforce_eager=True) as llm:

        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        first_layer = model.model.layers[0]

        # check the weights in MLP & SelfAttention are quantized to torch.uint8
        quantized_modules = (
            ('gate_up_proj', first_layer.mlp.gate_up_proj),
            ('down_proj', first_layer.mlp.down_proj),
            ('o_proj', first_layer.self_attn.o_proj),
            ('qkv_proj', first_layer.self_attn.qkv_proj),
        )
        for name, module in quantized_modules:
            qweight = module.qweight
            assert qweight.dtype == torch.uint8, (
                f'Expected {name} dtype torch.uint8 but got {qweight.dtype}')

        # some weights should not be quantized
        # (each message names the module actually checked; the original
        # reported 'input_layernorm' for post_attention_layernorm too)
        unquantized_modules = (
            ('lm_head', model.lm_head),
            ('embed_tokens', model.model.embed_tokens),
            ('input_layernorm', first_layer.input_layernorm),
            ('post_attention_layernorm',
             first_layer.post_attention_layernorm),
        )
        for name, module in unquantized_modules:
            assert module.weight.dtype != torch.uint8, (
                f'{name} weight dtype should not be torch.uint8')

        # check the output of the model is expected
        sampling_params = SamplingParams(temperature=0.0,
                                         logprobs=1,
                                         prompt_logprobs=1,
                                         max_tokens=8)

        prompts = ['That which does not kill us', 'To be or not to be,']
        expected_outputs = [
            'That which does not kill us makes us stronger.',
            'To be or not to be, that is the question.'
        ]
        outputs = llm.generate(prompts, sampling_params=sampling_params)

        assert len(outputs) == len(prompts)

        for output, expected in zip(outputs, expected_outputs):
            # compare the first line of the output
            # NOTE(review): output[1][0] is presumably the first generated
            # string for the prompt -- confirm against the runner's
            # generate() return format.
            actual_output = output[1][0].split('\n', 1)[0]
            expected_output = expected.split('\n', 1)[0]
            assert actual_output == expected_output, (
                f'Expected: {expected_output}, but got: {actual_output}')
tests/quantization/test_compressed_tensors.py
View file @
f48954a4
...
...
@@ -5,32 +5,58 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
import
torch
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsLinearMethod
,
CompressedTensorsW8A8StaticTensor
)
CompressedTensorsLinearMethod
,
CompressedTensorsW8A8DynamicToken
,
CompressedTensorsW8A8StaticTensor
)
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
):
model_path
=
"nm-testing/tinyllama-one-shot-static-quant-test-compressed"
llm
=
vllm_runner
(
model_path
,
quantization
=
"sparseml"
,
enforce_eager
=
True
)
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
o_proj
=
layer
.
self_attn
.
o_proj
gate_up_proj
=
layer
.
mlp
.
gate_up_proj
down_proj
=
layer
.
mlp
.
down_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
o_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
gate_up_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
down_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8StaticTensor
)
assert
qkv_proj
.
weight
.
dtype
is
torch
.
int8
assert
o_proj
.
weight
.
dtype
is
torch
.
int8
assert
gate_up_proj
.
weight
.
dtype
is
torch
.
int8
assert
qkv_proj
.
weight_scale
.
shard_splitter
is
not
None
assert
qkv_proj
.
weight_scale
.
logical_widths
is
not
None
assert
qkv_proj
.
input_scale
.
dtype
is
torch
.
float32
model_path
=
"nm-testing/tinyllama-oneshot-w8a8-static-v2"
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
o_proj
=
layer
.
self_attn
.
o_proj
gate_up_proj
=
layer
.
mlp
.
gate_up_proj
down_proj
=
layer
.
mlp
.
down_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
o_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
gate_up_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
down_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8StaticTensor
)
assert
qkv_proj
.
weight
.
dtype
is
torch
.
int8
assert
o_proj
.
weight
.
dtype
is
torch
.
int8
assert
gate_up_proj
.
weight
.
dtype
is
torch
.
int8
assert
qkv_proj
.
weight_scale
.
shard_splitter
is
not
None
assert
qkv_proj
.
weight_scale
.
logical_widths
is
not
None
assert
qkv_proj
.
input_scale
.
dtype
is
torch
.
float32
def test_compressed_tensors_no_enforce_eager(vllm_runner):
    """Smoke-test generation for the W8A8 static model without enforce_eager.

    The runner is created with only the model path, and the test passes when
    generating from a single prompt yields a truthy result.
    """
    with vllm_runner("nm-testing/tinyllama-oneshot-w8a8-static-v2") as llm:
        default_params = SamplingParams()
        generated = llm.generate("Hello world!",
                                 sampling_params=default_params)
        assert generated
def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
    """Check the quantization setup of the W8A8 dynamic per-token model.

    Loads the model in eager mode with float16 and inspects the first
    decoder layer's ``qkv_proj``: it must use
    ``CompressedTensorsLinearMethod`` with the
    ``CompressedTensorsW8A8DynamicToken`` scheme and hold int8 weights.
    """
    checkpoint = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
    with vllm_runner(checkpoint, enforce_eager=True,
                     dtype=torch.float16) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        qkv_proj = model.model.layers[0].self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
        assert qkv_proj.weight.dtype is torch.int8
tests/quantization/test_fp8.py
View file @
f48954a4
...
...
@@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
(),
reason
=
"FP8 is not supported on this GPU type."
)
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
llm
=
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
Prev
1
…
3
4
5
6
7
8
9
10
11
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment