Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f48954a4
Commit
f48954a4
authored
Jun 12, 2024
by
zhuwenwen
Browse files
merge v0.5.0
parents
1dba29d3
8f89d720
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
735 additions
and
266 deletions
+735
-266
tests/lora/test_long_context.py
tests/lora/test_long_context.py
+13
-10
tests/lora/test_utils.py
tests/lora/test_utils.py
+13
-1
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+42
-42
tests/models/test_aqlm.py
tests/models/test_aqlm.py
+10
-8
tests/models/test_big_models.py
tests/models/test_big_models.py
+17
-14
tests/models/test_embedding.py
tests/models/test_embedding.py
+4
-6
tests/models/test_fp8.py
tests/models/test_fp8.py
+7
-4
tests/models/test_gptq_marlin.py
tests/models/test_gptq_marlin.py
+24
-24
tests/models/test_gptq_marlin_24.py
tests/models/test_gptq_marlin_24.py
+16
-17
tests/models/test_llava.py
tests/models/test_llava.py
+74
-71
tests/models/test_llava_next.py
tests/models/test_llava_next.py
+123
-0
tests/models/test_marlin.py
tests/models/test_marlin.py
+17
-18
tests/models/test_mistral.py
tests/models/test_mistral.py
+6
-9
tests/models/test_models.py
tests/models/test_models.py
+9
-12
tests/multimodal/__init__.py
tests/multimodal/__init__.py
+0
-0
tests/multimodal/test_processor.py
tests/multimodal/test_processor.py
+149
-0
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+75
-0
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+80
-0
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+51
-25
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+5
-5
No files found.
tests/lora/test_long_context.py
View file @
f48954a4
...
@@ -102,22 +102,21 @@ def batched_generate(
...
@@ -102,22 +102,21 @@ def batched_generate(
return
[
outputs
[
i
].
outputs
[
0
].
text
.
strip
()
for
i
in
range
(
len
(
outputs
))]
return
[
outputs
[
i
].
outputs
[
0
].
text
.
strip
()
for
i
in
range
(
len
(
outputs
))]
@
pytest
.
fixture
@
pytest
.
fixture
(
scope
=
"module"
)
def
lora_llm
(
long_context_infos
):
def
lora_llm
(
long_context_infos
):
scaling_factors
=
[
scaling_factors
=
[
context_len_to_scaling_factor
[
info
[
"context_length"
]]
context_len_to_scaling_factor
[
info
[
"context_length"
]]
for
info
in
long_context_infos
.
values
()
for
info
in
long_context_infos
.
values
()
]
]
llm
=
vllm
.
LLM
(
llm
=
vllm
.
LLM
(
"meta-llama/Llama-2-13b-chat-hf"
,
"meta-llama/Llama-2-13b-chat-hf"
,
enable_lora
=
True
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
2
,
max_loras
=
2
,
long_lora_scaling_factors
=
tuple
(
scaling_factors
),
long_lora_scaling_factors
=
tuple
(
scaling_factors
),
max_num_batched_tokens
=
4096
*
8
,
max_num_batched_tokens
=
4096
*
8
,
tensor_parallel_size
=
4
,
tensor_parallel_size
=
4
,
distributed_executor_backend
=
"mp"
)
)
yield
llm
yield
llm
del
llm
del
llm
...
@@ -154,6 +153,7 @@ def test_rotary_emb_replaced(dist_init):
...
@@ -154,6 +153,7 @@ def test_rotary_emb_replaced(dist_init):
assert
rotary_emb_count
==
32
assert
rotary_emb_count
==
32
@
pytest
.
mark
.
skip_global_cleanup
def
test_batched_rope_kernel
(
lora_llm
,
long_context_infos
):
def
test_batched_rope_kernel
(
lora_llm
,
long_context_infos
):
"""We test the batched kernel by comparing the results of batched an
"""We test the batched kernel by comparing the results of batched an
non-batched generation.
non-batched generation.
...
@@ -188,6 +188,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
...
@@ -188,6 +188,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
f
"same:
\n
{
batched
}
\n
{
non_batched
}
"
)
f
"same:
\n
{
batched
}
\n
{
non_batched
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_self_consistency
(
lora_llm
,
long_context_infos
):
def
test_self_consistency
(
lora_llm
,
long_context_infos
):
"""We test consistency of the batched kernel by permuting batched
"""We test consistency of the batched kernel by permuting batched
inputs and comparing the results to the non-permuted batched results.
inputs and comparing the results to the non-permuted batched results.
...
@@ -227,6 +228,7 @@ def test_self_consistency(lora_llm, long_context_infos):
...
@@ -227,6 +228,7 @@ def test_self_consistency(lora_llm, long_context_infos):
f
"
\n
{
permutated_batched_results
[
permutation
[
i
]]
}
"
)
f
"
\n
{
permutated_batched_results
[
permutation
[
i
]]
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_quality
(
lora_llm
,
long_context_infos
):
def
test_quality
(
lora_llm
,
long_context_infos
):
"""We test the quality of the answers given by the LoRA model by
"""We test the quality of the answers given by the LoRA model by
comparing the generated text to the merged model's outputs.
comparing the generated text to the merged model's outputs.
...
@@ -257,6 +259,7 @@ def test_quality(lora_llm, long_context_infos):
...
@@ -257,6 +259,7 @@ def test_quality(lora_llm, long_context_infos):
assert
np
.
mean
(
scores
)
>
0.5
assert
np
.
mean
(
scores
)
>
0.5
@
pytest
.
mark
.
skip_global_cleanup
def
test_max_len
(
lora_llm
,
long_context_infos
):
def
test_max_len
(
lora_llm
,
long_context_infos
):
"""Test that we raise an ValueError when the input of a given LoRA
"""Test that we raise an ValueError when the input of a given LoRA
model exceeds the maximum length."""
model exceeds the maximum length."""
...
...
tests/lora/test_utils.py
View file @
f48954a4
from
collections
import
OrderedDict
from
collections
import
OrderedDict
import
pytest
from
torch
import
nn
from
torch
import
nn
from
vllm.lora.utils
import
parse_fine_tuned_lora_name
,
replace_submodule
from
vllm.lora.utils
import
parse_fine_tuned_lora_name
,
replace_submodule
from
vllm.utils
import
LRUCache
from
vllm.utils
import
LRUCache
def
test_parse_fine_tuned_lora_name
():
def
test_parse_fine_tuned_lora_name
_valid
():
fixture
=
{
fixture
=
{
(
"base_model.model.lm_head.lora_A.weight"
,
"lm_head"
,
True
),
(
"base_model.model.lm_head.lora_A.weight"
,
"lm_head"
,
True
),
(
"base_model.model.lm_head.lora_B.weight"
,
"lm_head"
,
False
),
(
"base_model.model.lm_head.lora_B.weight"
,
"lm_head"
,
False
),
...
@@ -35,6 +36,17 @@ def test_parse_fine_tuned_lora_name():
...
@@ -35,6 +36,17 @@ def test_parse_fine_tuned_lora_name():
assert
(
module_name
,
is_lora_a
)
==
parse_fine_tuned_lora_name
(
name
)
assert
(
module_name
,
is_lora_a
)
==
parse_fine_tuned_lora_name
(
name
)
def
test_parse_fine_tuned_lora_name_invalid
():
fixture
=
{
"weight"
,
"base_model.weight"
,
"base_model.model.weight"
,
}
for
name
in
fixture
:
with
pytest
.
raises
(
ValueError
,
match
=
"unsupported LoRA weight"
):
parse_fine_tuned_lora_name
(
name
)
def
test_replace_submodule
():
def
test_replace_submodule
():
model
=
nn
.
Sequential
(
model
=
nn
.
Sequential
(
OrderedDict
([
OrderedDict
([
...
...
tests/metrics/test_metrics.py
View file @
f48954a4
...
@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens(
...
@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
)
->
None
:
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
with
vllm_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
)
gpu_memory_utilization
=
0.4
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
prompt_token_counts
=
[
len
(
tokenizer
.
encode
(
p
))
for
p
in
example_prompts
]
prompt_token_counts
=
[
# This test needs at least 2 prompts in a batch of different lengths to
len
(
tokenizer
.
encode
(
p
))
for
p
in
example_prompts
# verify their token count is correct despite padding.
]
assert
len
(
example_prompts
)
>
1
,
"at least 2 prompts are required"
# This test needs at least 2 prompts in a batch of different lengths to
assert
prompt_token_counts
[
0
]
!=
prompt_token_counts
[
1
],
(
# verify their token count is correct despite padding.
"prompts of different lengths are required"
)
assert
len
(
example_prompts
)
>
1
,
"at least 2 prompts are required"
vllm_prompt_token_count
=
sum
(
prompt_token_counts
)
assert
prompt_token_counts
[
0
]
!=
prompt_token_counts
[
1
],
(
"prompts of different lengths are required"
)
_
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_prompt_token_count
=
sum
(
prompt_token_counts
)
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metric_count
=
stat_logger
.
metrics
.
counter_prompt_tokens
.
labels
(
_
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
**
stat_logger
.
labels
).
_value
.
get
()
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metric_count
=
stat_logger
.
metrics
.
counter_prompt_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
assert
vllm_prompt_token_count
==
metric_count
,
(
assert
vllm_prompt_token_count
==
metric_count
,
(
f
"prompt token count:
{
vllm_prompt_token_count
!
r
}
\n
"
f
"prompt token count:
{
vllm_prompt_token_count
!
r
}
\n
"
...
@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens(
...
@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
)
->
None
:
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
with
vllm_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
)
gpu_memory_utilization
=
0.4
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metric_count
=
stat_logger
.
metrics
.
counter_generation_tokens
.
labels
(
metric_count
=
stat_logger
.
metrics
.
counter_generation_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
**
stat_logger
.
labels
).
_value
.
get
()
vllm_generation_count
=
0
vllm_generation_count
=
0
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
prompt_ids
=
tokenizer
.
encode
(
example_prompts
[
i
])
prompt_ids
=
tokenizer
.
encode
(
example_prompts
[
i
])
# vllm_output_ids contains both prompt tokens and generation tokens.
# vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens.
# We're interested only in the count of the generation tokens.
vllm_generation_count
+=
len
(
vllm_output_ids
)
-
len
(
prompt_ids
)
vllm_generation_count
+=
len
(
vllm_output_ids
)
-
len
(
prompt_ids
)
assert
vllm_generation_count
==
metric_count
,
(
assert
vllm_generation_count
==
metric_count
,
(
f
"generation token count:
{
vllm_generation_count
!
r
}
\n
"
f
"generation token count:
{
vllm_generation_count
!
r
}
\n
"
...
@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens(
...
@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens(
[
None
,
[],
[
"ModelName0"
],
[
"ModelName0"
,
"ModelName1"
,
"ModelName2"
]])
[
None
,
[],
[
"ModelName0"
],
[
"ModelName0"
,
"ModelName1"
,
"ModelName2"
]])
def
test_metric_set_tag_model_name
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
def
test_metric_set_tag_model_name
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
served_model_name
:
List
[
str
])
->
None
:
served_model_name
:
List
[
str
])
->
None
:
vllm_model
=
vllm_runner
(
model
,
with
vllm_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.3
,
gpu_memory_utilization
=
0.3
,
served_model_name
=
served_model_name
)
served_model_name
=
served_model_name
)
as
vllm_model
:
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_logger
metrics_tag_content
=
stat_logger
.
labels
[
"model_name"
]
metrics_tag_content
=
stat_logger
.
labels
[
"model_name"
]
del
vllm_model
if
served_model_name
is
None
or
served_model_name
==
[]:
if
served_model_name
is
None
or
served_model_name
==
[]:
assert
metrics_tag_content
==
model
,
(
assert
metrics_tag_content
==
model
,
(
...
...
tests/models/test_aqlm.py
View file @
f48954a4
...
@@ -8,10 +8,13 @@ import torch
...
@@ -8,10 +8,13 @@ import torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
aqlm_not_supported
=
True
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
aqlm_not_supported
=
(
capability
<
if
torch
.
cuda
.
is_available
():
QUANTIZATION_METHODS
[
"aqlm"
].
get_min_capability
())
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
aqlm_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"aqlm"
].
get_min_capability
())
# In this test we hardcode prompts and generations for the model so we don't
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
# need to require the AQLM package as a dependency
...
@@ -79,10 +82,9 @@ def test_models(
...
@@ -79,10 +82,9 @@ def test_models(
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
None
:
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
max_tokens
,
example_prompts
,
max_tokens
,
num_logprobs
)
num_logprobs
)
# loop through the prompts to compare against the ground truth generations
# loop through the prompts to compare against the ground truth generations
for
prompt_idx
in
range
(
len
(
example_prompts
)):
for
prompt_idx
in
range
(
len
(
example_prompts
)):
...
...
tests/models/test_big_models.py
View file @
f48954a4
...
@@ -5,6 +5,7 @@ This tests bigger models and use half precision.
...
@@ -5,6 +5,7 @@ This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
Run `pytest tests/models/test_big_models.py`.
"""
"""
import
pytest
import
pytest
import
torch
MODELS
=
[
MODELS
=
[
"meta-llama/Llama-2-7b-hf"
,
"meta-llama/Llama-2-7b-hf"
,
...
@@ -16,9 +17,14 @@ MODELS = [
...
@@ -16,9 +17,14 @@ MODELS = [
# "Qwen/Qwen1.5-0.5B" # Broken,
# "Qwen/Qwen1.5-0.5B" # Broken,
]
]
#TODO: remove this after CPU float16 support ready
target_dtype
=
"float"
if
torch
.
cuda
.
is_available
():
target_dtype
=
"half"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
def
test_models
(
def
test_models
(
hf_runner
,
hf_runner
,
...
@@ -28,13 +34,11 @@ def test_models(
...
@@ -28,13 +34,11 @@ def test_models(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
)
->
None
:
)
->
None
:
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
@@ -46,15 +50,14 @@ def test_models(
...
@@ -46,15 +50,14 @@ def test_models(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
def
test_model_print
(
def
test_model_print
(
vllm_runner
,
vllm_runner
,
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# This test is for verifying whether the model's extra_repr
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
model_runner
.
model
)
del
vllm_model
tests/models/test_embedding.py
View file @
f48954a4
...
@@ -28,13 +28,11 @@ def test_models(
...
@@ -28,13 +28,11 @@ def test_models(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
with
hf_runner
(
model
,
dtype
=
dtype
,
is_embedding_model
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
del
vllm_model
similarities
=
compare_embeddings
(
hf_outputs
,
vllm_outputs
)
similarities
=
compare_embeddings
(
hf_outputs
,
vllm_outputs
)
all_similarities
=
torch
.
stack
(
similarities
)
all_similarities
=
torch
.
stack
(
similarities
)
...
...
tests/models/test_fp8.py
View file @
f48954a4
...
@@ -67,10 +67,13 @@ EXPECTED_STRS_MAP = {
...
@@ -67,10 +67,13 @@ EXPECTED_STRS_MAP = {
},
},
}
}
capability
=
torch
.
cuda
.
get_device_capability
()
fp8_not_supported
=
True
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
fp8_not_supported
=
(
capability
<
if
torch
.
cuda
.
is_available
():
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
())
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
fp8_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
())
@
pytest
.
mark
.
skipif
(
fp8_not_supported
,
@
pytest
.
mark
.
skipif
(
fp8_not_supported
,
...
...
tests/models/test_gptq_marlin.py
View file @
f48954a4
...
@@ -22,10 +22,13 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
...
@@ -22,10 +22,13 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN
=
1024
MAX_MODEL_LEN
=
1024
capability
=
torch
.
cuda
.
get_device_capability
()
gptq_marlin_not_supported
=
True
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
gptq_marlin_not_supported
=
(
if
torch
.
cuda
.
is_available
():
capability
<
QUANTIZATION_METHODS
[
"gptq_marlin"
].
get_min_capability
())
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
gptq_marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"gptq_marlin"
].
get_min_capability
())
MODELS
=
[
MODELS
=
[
# act_order==False, group_size=channelwise
# act_order==False, group_size=channelwise
...
@@ -67,32 +70,29 @@ def test_models(
...
@@ -67,32 +70,29 @@ def test_models(
model_name
,
revision
=
model
model_name
,
revision
=
model
# Run marlin.
# Run marlin.
gptq_marlin_model
=
vllm_runner
(
model_name
=
model_name
,
with
vllm_runner
(
model_name
=
model_name
,
revision
=
revision
,
revision
=
revision
,
dtype
=
dtype
,
dtype
=
dtype
,
quantization
=
"marlin"
,
quantization
=
"marlin"
,
max_model_len
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
1
)
tensor_parallel_size
=
1
)
as
gptq_marlin_model
:
gptq_marlin_outputs
=
gptq_marlin_model
.
generate_greedy_logprobs
(
gptq_marlin_outputs
=
gptq_marlin_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
del
gptq_marlin_model
_ROPE_DICT
.
clear
()
# clear rope cache to avoid rope dtype error
_ROPE_DICT
.
clear
()
# clear rope cache to avoid rope dtype error
# Run gptq.
# Run gptq.
# The naive gptq kernel doesn't support bf16 yet.
# The naive gptq kernel doesn't support bf16 yet.
# Here we always compare fp16/bf16 gpt marlin kernel
# Here we always compare fp16/bf16 gpt marlin kernel
# to fp16 gptq kernel.
# to fp16 gptq kernel.
gptq_model
=
vllm_runner
(
model_name
=
model_name
,
with
vllm_runner
(
model_name
=
model_name
,
revision
=
revision
,
revision
=
revision
,
dtype
=
"half"
,
dtype
=
"half"
,
quantization
=
"gptq"
,
quantization
=
"gptq"
,
max_model_len
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
1
)
tensor_parallel_size
=
1
)
as
gptq_model
:
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
[:
-
1
],
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
max_tokens
,
example_prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
num_logprobs
)
del
gptq_model
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
gptq_outputs
,
outputs_0_lst
=
gptq_outputs
,
...
...
tests/models/test_gptq_marlin_24.py
View file @
f48954a4
...
@@ -14,10 +14,13 @@ import torch
...
@@ -14,10 +14,13 @@ import torch
from
tests.models.utils
import
check_logprobs_close
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
marlin_not_supported
=
True
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
if
torch
.
cuda
.
is_available
():
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
@
dataclass
...
@@ -58,20 +61,16 @@ def test_models(
...
@@ -58,20 +61,16 @@ def test_models(
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
None
:
)
->
None
:
marlin_24_model
=
vllm_runner
(
model_pair
.
model_marlin
,
with
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
dtype
=
dtype
,
quantization
=
"gptq_marlin_24"
)
quantization
=
"gptq_marlin_24"
)
as
marlin_24_model
:
marlin_24_outputs
=
marlin_24_model
.
generate_greedy_logprobs
(
marlin_24_outputs
=
marlin_24_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
del
marlin_24_model
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
with
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
,
dtype
=
dtype
,
quantization
=
"gptq"
)
as
gptq_model
:
quantization
=
"gptq"
)
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
example_prompts
,
max_tokens
,
num_logprobs
)
max_tokens
,
num_logprobs
)
del
gptq_model
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
gptq_outputs
,
outputs_0_lst
=
gptq_outputs
,
...
...
tests/models/test_llava.py
View file @
f48954a4
import
gc
from
typing
import
List
,
Tuple
from
dataclasses
import
fields
from
enum
import
Enum
from
typing
import
Dict
,
List
,
Tuple
import
pytest
import
pytest
import
torch
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm.config
import
VisionLanguageConfig
from
vllm.config
import
VisionLanguageConfig
model_and_vl_config
=
[
from
..conftest
import
IMAGE_FILES
(
"llava-hf/llava-1.5-7b-hf"
,
VisionLanguageConfig
(
pytestmark
=
pytest
.
mark
.
llava
image_input_type
=
VisionLanguageConfig
.
ImageInputType
.
PIXEL_VALUES
,
image_feature_size
=
576
,
# The image token is placed before "user" on purpose so that the test can pass
image_token_id
=
32000
,
HF_IMAGE_PROMPTS
=
[
image_input_shape
=
(
1
,
3
,
336
,
336
))),
"<image>
\n
USER: What's the content of the image?
\n
ASSISTANT:"
,
(
"llava-hf/llava-1.5-7b-hf"
,
"<image>
\n
USER: What is the season?
\n
ASSISTANT:"
,
VisionLanguageConfig
(
image_input_type
=
VisionLanguageConfig
.
ImageInputType
.
IMAGE_FEATURES
,
image_feature_size
=
576
,
image_token_id
=
32000
,
image_input_shape
=
(
1
,
576
,
1024
)))
]
]
assert
len
(
HF_IMAGE_PROMPTS
)
==
len
(
IMAGE_FILES
)
def
as_dict
(
vision_language_config
:
VisionLanguageConfig
)
->
Dict
:
"""Flatten vision language config to pure args.
Compatible with what llm entrypoint expects.
def
iter_llava_configs
(
model_name
:
str
):
"""
image_hw_to_feature_size
=
{
result
=
{}
(
336
,
336
):
576
,
for
field
in
fields
(
vision_language_config
):
}
value
=
getattr
(
vision_language_config
,
field
.
name
)
if
isinstance
(
value
,
Enum
):
for
(
h
,
w
),
f
in
image_hw_to_feature_size
.
items
():
result
[
field
.
name
]
=
value
.
name
.
lower
()
for
input_type
,
input_shape
in
[
elif
isinstance
(
value
,
tuple
):
(
VisionLanguageConfig
.
ImageInputType
.
PIXEL_VALUES
,
(
1
,
3
,
h
,
w
)),
result
[
field
.
name
]
=
","
.
join
([
str
(
item
)
for
item
in
value
])
(
VisionLanguageConfig
.
ImageInputType
.
IMAGE_FEATURES
,
(
1
,
f
,
1024
)),
else
:
]:
result
[
field
.
name
]
=
value
yield
(
model_name
,
return
result
VisionLanguageConfig
(
image_input_type
=
input_type
,
image_feature_size
=
f
,
image_token_id
=
32000
,
def
sanitize_vllm_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
],
image_input_shape
=
input_shape
,
vision_language_config
:
VisionLanguageConfig
,
image_processor
=
model_name
,
model_id
:
str
):
image_processor_revision
=
None
))
model_and_vl_config
=
[
*
iter_llava_configs
(
"llava-hf/llava-1.5-7b-hf"
),
]
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
],
vlm_config
:
VisionLanguageConfig
,
model_id
:
str
):
"""Sanitize vllm output to be comparable with hf output.
"""Sanitize vllm output to be comparable with hf output.
The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
"""
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_token_str
=
tokenizer
.
decode
(
vision_language_config
.
image_token_id
)
image_token_str_len
=
len
(
image_token_str
)
input_ids
,
output_str
=
vllm_output
input_ids
,
output_str
=
vllm_output
sanitized_input_ids
=
input_ids
[
0
:
2
]
+
input_ids
[
2
+
vision_language_config
image_token_id
=
vlm_config
.
image_token_id
.
image_feature_size
-
1
:]
sanitzied_output_str
=
output_str
[
vision_language_config
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_feature_size
*
image_token_str
=
tokenizer
.
decode
(
image_token_id
)
image_token_str_len
:]
return
sanitized_input_ids
,
sanitzied_output_str
hf_input_ids
=
[
input_id
for
idx
,
input_id
in
enumerate
(
input_ids
)
if
input_id
!=
image_token_id
or
input_ids
[
idx
-
1
]
!=
image_token_id
]
hf_output_str
=
output_str
\
.
replace
(
image_token_str
*
vlm_config
.
image_feature_size
,
""
)
@
pytest
.
mark
.
parametrize
(
"worker_use_ray"
,
[
False
])
return
hf_input_ids
,
hf_output_str
# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
@
pytest
.
mark
.
parametrize
(
"model_and_config"
,
model_and_vl_config
)
@
pytest
.
mark
.
parametrize
(
"model_and_config"
,
model_and_vl_config
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
def
test_models
(
hf_runner
,
vllm_runner
,
hf_image_prompts
,
hf_images
,
def
test_models
(
hf_runner
,
vllm_runner
,
hf_images
,
vllm_images
,
vllm_image_prompts
,
vllm_images
,
model_and_config
:
tuple
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
dtype
:
str
,
max_tokens
:
int
,
worker_use_ray
:
bool
)
->
None
:
"""Inference result should be the same between hf and vllm.
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the
raw
images as input.
For huggingface runner, we provide the
PIL
images as input.
For vllm runner, we provide
image tensor
s and corresponding
For vllm runner, we provide
MultiModalData object
s and corresponding
vision language config as input.
vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
The text output is sanitized to be able to compare with hf.
"""
"""
model_id
,
vision_language_config
=
model_and_config
model_id
,
vlm_config
=
model_and_config
hf_model
=
hf_runner
(
model_id
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
hf_image_prompts
,
with
hf_runner
(
model_id
,
dtype
=
dtype
,
is_vision_model
=
True
)
as
hf_model
:
max_tokens
,
hf_outputs
=
hf_model
.
generate_greedy
(
HF_IMAGE_PROMPTS
,
images
=
hf_images
)
del
hf_model
vllm_model
=
vllm_runner
(
model_id
,
dtype
=
dtype
,
worker_use_ray
=
worker_use_ray
,
**
as_dict
(
vision_language_config
))
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_image_prompts
,
max_tokens
,
max_tokens
,
images
=
vllm_images
)
images
=
hf_images
)
del
vllm_model
vllm_image_prompts
=
[
p
.
replace
(
"<image>"
,
"<image>"
*
vlm_config
.
image_feature_size
)
for
p
in
HF_IMAGE_PROMPTS
]
gc
.
collect
()
with
vllm_runner
(
model_id
,
torch
.
cuda
.
empty_cache
()
dtype
=
dtype
,
enforce_eager
=
True
,
**
vlm_config
.
as_cli_args_dict
())
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_image_prompts
,
max_tokens
,
images
=
vllm_images
)
for
i
in
range
(
len
(
hf_image_prompts
)):
for
i
in
range
(
len
(
HF_IMAGE_PROMPTS
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
sanitize_vllm
_output
(
vllm_output_ids
,
vllm_output_str
=
vllm_to_hf
_output
(
vllm_outputs
[
i
],
v
ision_language
_config
,
model_id
)
vllm_outputs
[
i
],
v
lm
_config
,
model_id
)
assert
hf_output_str
==
vllm_output_str
,
(
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
assert
hf_output_ids
==
vllm_output_ids
,
(
...
...
tests/models/test_llava_next.py
0 → 100644
View file @
f48954a4
from
typing
import
List
,
Tuple
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.config
import
VisionLanguageConfig
from
..conftest
import
IMAGE_FILES
pytestmark
=
pytest
.
mark
.
llava
_PREFACE
=
(
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's "
"questions."
)
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
[
f
"
{
_PREFACE
}
<image>
\n
USER: What's the content of the image? ASSISTANT:"
,
f
"
{
_PREFACE
}
<image>
\n
USER: What is the season? ASSISTANT:"
,
]
assert
len
(
HF_IMAGE_PROMPTS
)
==
len
(
IMAGE_FILES
)
def
iter_llava_next_configs
(
model_name
:
str
):
image_hw_to_feature_size
=
{
(
336
,
336
):
1176
,
(
672
,
672
):
2928
,
(
1344
,
336
):
1944
,
(
336
,
1344
):
1890
,
}
for
(
h
,
w
),
f
in
image_hw_to_feature_size
.
items
():
for
input_type
,
input_shape
in
[
(
VisionLanguageConfig
.
ImageInputType
.
PIXEL_VALUES
,
(
1
,
3
,
h
,
w
)),
]:
yield
(
model_name
,
VisionLanguageConfig
(
image_input_type
=
input_type
,
image_feature_size
=
f
,
image_token_id
=
32000
,
image_input_shape
=
input_shape
,
image_processor
=
model_name
,
image_processor_revision
=
None
))
model_and_vl_config
=
[
*
iter_llava_next_configs
(
"llava-hf/llava-v1.6-vicuna-7b-hf"
),
]
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
],
vlm_config
:
VisionLanguageConfig
,
model_id
:
str
):
"""Sanitize vllm output to be comparable with hf output.
The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
input_ids
,
output_str
=
vllm_output
image_token_id
=
vlm_config
.
image_token_id
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_token_str
=
tokenizer
.
decode
(
image_token_id
)
hf_input_ids
=
[
input_id
for
idx
,
input_id
in
enumerate
(
input_ids
)
if
input_id
!=
image_token_id
or
input_ids
[
idx
-
1
]
!=
image_token_id
]
hf_output_str
=
output_str
\
.
replace
(
image_token_str
*
vlm_config
.
image_feature_size
,
" "
)
return
hf_input_ids
,
hf_output_str
@pytest.mark.xfail(
    reason="Inconsistent image processor being used due to lack "
    "of support for dynamic image token replacement")
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                model_and_config, dtype: str, max_tokens: int) -> None:
    """Check that greedy generation matches between hf and vllm.

    The hf runner receives the prompts unchanged together with `hf_images`.
    The vllm runner receives `vllm_images` plus the vision language config,
    and prompts in which each "<image>" is repeated `image_feature_size`
    times. The vllm output is sanitized with `vllm_to_hf_output` before it
    is compared against the hf output.
    """
    model_id, vlm_config = model_and_config

    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
                                              max_tokens,
                                              images=hf_images)

    # vllm expects one image token per image feature.
    vllm_image_prompts = []
    for prompt in HF_IMAGE_PROMPTS:
        vllm_image_prompts.append(
            prompt.replace("<image>",
                           "<image>" * vlm_config.image_feature_size))

    with vllm_runner(
            model_id,
            dtype=dtype,
            # should be greater than image_feature_size
            max_model_len=4096,
            enforce_eager=True,
            **vlm_config.as_cli_args_dict(),
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                  max_tokens,
                                                  images=vllm_images)

    for i, _ in enumerate(HF_IMAGE_PROMPTS):
        hf_output_ids, hf_output_str = hf_outputs[i]
        sanitized = vllm_to_hf_output(vllm_outputs[i], vlm_config, model_id)
        vllm_output_ids, vllm_output_str = sanitized

        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
tests/models/test_marlin.py
View file @
f48954a4
...
@@ -19,10 +19,13 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
...
@@ -19,10 +19,13 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from
.utils
import
check_logprobs_close
from
.utils
import
check_logprobs_close
capability
=
torch
.
cuda
.
get_device_capability
()
marlin_not_supported
=
True
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
if
torch
.
cuda
.
is_available
():
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
@
dataclass
...
@@ -56,20 +59,16 @@ def test_models(
...
@@ -56,20 +59,16 @@ def test_models(
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
None
:
)
->
None
:
marlin_model
=
vllm_runner
(
model_pair
.
model_marlin
,
with
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
dtype
=
dtype
,
quantization
=
"marlin"
)
quantization
=
"marlin"
)
as
marlin_model
:
marlin_outputs
=
marlin_model
.
generate_greedy_logprobs
(
marlin_outputs
=
marlin_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
del
marlin_model
with
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
,
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
quantization
=
"gptq"
)
as
gptq_model
:
dtype
=
dtype
,
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
quantization
=
"gptq"
)
example_prompts
,
max_tokens
,
num_logprobs
)
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
del
gptq_model
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
gptq_outputs
,
outputs_0_lst
=
gptq_outputs
,
...
...
tests/models/test_mistral.py
View file @
f48954a4
...
@@ -26,16 +26,13 @@ def test_models(
...
@@ -26,16 +26,13 @@ def test_models(
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
None
:
)
->
None
:
# TODO(sang): Sliding window should be tested separately.
# TODO(sang): Sliding window should be tested separately.
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
max_tokens
,
example_prompts
,
max_tokens
,
num_logprobs
)
num_logprobs
)
del
vllm_model
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
outputs_1_lst
=
vllm_outputs
,
...
...
tests/models/test_models.py
View file @
f48954a4
...
@@ -34,13 +34,11 @@ def test_models(
...
@@ -34,13 +34,11 @@ def test_models(
# To pass the small model tests, we need full precision.
# To pass the small model tests, we need full precision.
assert
dtype
==
"float"
assert
dtype
==
"float"
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
@@ -58,9 +56,8 @@ def test_model_print(
...
@@ -58,9 +56,8 @@ def test_model_print(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# This test is for verifying whether the model's extra_repr
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
model_runner
.
model
)
del
vllm_model
tests/multimodal/__init__.py
0 → 100644
View file @
f48954a4
tests/multimodal/test_processor.py
0 → 100644
View file @
f48954a4
import
numpy
as
np
import
pytest
from
transformers
import
CLIPImageProcessor
,
LlavaNextImageProcessor
from
vllm.config
import
ModelConfig
,
VisionLanguageConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
ImagePixelData
from
..conftest
import
_STR_DTYPE_TO_TORCH_DTYPE
@pytest.mark.parametrize("dtype", ["half", "float"])
def test_clip_image_processor(hf_images, dtype):
    """Compare vllm image preprocessing against HF `CLIPImageProcessor`.

    For every image in `hf_images`, the result of the multimodal registry
    must have the same keys as the HF result, and each entry must match in
    shape and (approximately) in value.
    """
    model_name = "llava-hf/llava-1.5-7b-hf"
    image_height = image_width = 560

    hf_processor = CLIPImageProcessor.from_pretrained(model_name)
    assert isinstance(hf_processor, CLIPImageProcessor)

    model_config = ModelConfig(
        model=model_name,
        tokenizer=model_name,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=32000,
        image_input_shape=(1, 3, image_height, image_width),
        image_feature_size=576,
        image_processor=model_name,
        image_processor_revision=None,
    )

    for image in hf_images:
        # Reference result, cast to the dtype under test.
        hf_batch = hf_processor.preprocess(image, return_tensors="pt")
        hf_result = hf_batch.to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])

        vllm_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert hf_result.keys() == vllm_result.keys()
        for key, hf_tensor in hf_result.items():
            expected: np.ndarray = hf_tensor.numpy()
            actual: np.ndarray = vllm_result[key].numpy()
            assert expected.shape == actual.shape, f"Failed for key={key}"
            assert np.allclose(expected, actual), f"Failed for key={key}"
@pytest.mark.xfail(
    reason="Inconsistent image processor being used due to lack "
    "of support for dynamic image token replacement")
@pytest.mark.parametrize("dtype", ["half", "float"])
def test_llava_next_image_processor(hf_images, dtype):
    """Compare vllm image preprocessing against HF `LlavaNextImageProcessor`.

    For every image in `hf_images`, the result of the multimodal registry
    must have the same keys as the HF result, and each entry must match in
    shape and (approximately) in value. Currently marked as expected to fail.
    """
    model_name = "llava-hf/llava-v1.6-34b-hf"
    image_height = image_width = 560

    hf_processor = LlavaNextImageProcessor.from_pretrained(model_name)
    assert isinstance(hf_processor, LlavaNextImageProcessor)

    model_config = ModelConfig(
        model=model_name,
        tokenizer=model_name,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=64000,
        image_input_shape=(1, 3, image_height, image_width),
        image_feature_size=2928,
        image_processor=model_name,
        image_processor_revision=None,
    )

    for image in hf_images:
        # Reference result, cast to the dtype under test.
        hf_batch = hf_processor.preprocess(image, return_tensors="pt")
        hf_result = hf_batch.to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])

        vllm_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert hf_result.keys() == vllm_result.keys()
        for key, hf_tensor in hf_result.items():
            expected: np.ndarray = hf_tensor.numpy()
            actual: np.ndarray = vllm_result[key].numpy()
            assert expected.shape == actual.shape, f"Failed for key={key}"
            assert np.allclose(expected, actual), f"Failed for key={key}"
@pytest.mark.xfail(
    reason="Example image pixels were not processed using HuggingFace")
@pytest.mark.parametrize("dtype", ["float"])
def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
    """Check that image and tensor inputs are processed to the same result.

    Each image from `hf_images` is paired with the tensor at the same
    position in `vllm_image_tensors`; both are wrapped in `ImagePixelData`
    and run through the multimodal registry. The two results must share
    keys, shapes and (approximately) values. Currently expected to fail.
    """
    model_name = "llava-hf/llava-1.5-7b-hf"
    image_height = image_width = 560

    model_config = ModelConfig(
        model=model_name,
        tokenizer=model_name,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=32000,
        image_input_shape=(1, 3, image_height, image_width),
        image_feature_size=576,
        image_processor=model_name,
        image_processor_revision=None,
    )

    for image, tensor in zip(hf_images, vllm_image_tensors):
        image_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )
        tensor_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(tensor),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert image_result.keys() == tensor_result.keys()
        for key, from_image in image_result.items():
            from_tensor: np.ndarray = tensor_result[key].numpy()
            assert from_image.shape == from_tensor.shape, (
                f"Failed for key={key}")
            assert np.allclose(from_image, from_tensor), (
                f"Failed for key={key}")
tests/multimodal/test_utils.py
0 → 100644
View file @
f48954a4
import
base64
import
mimetypes
from
tempfile
import
NamedTemporaryFile
from
typing
import
Dict
,
Tuple
import
numpy
as
np
import
pytest
import
pytest_asyncio
from
PIL
import
Image
from
vllm.multimodal.utils
import
ImageFetchAiohttp
# Remote images used by the fetch tests.
# They cover different image extensions (JPG/PNG) and formats (gray/RGB/RGBA).
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
@pytest_asyncio.fixture(scope="session")
async def url_images() -> Dict[str, Image.Image]:
    """Session-scoped fixture mapping each test URL to its fetched image.

    The images are fetched one after another with
    `ImageFetchAiohttp.fetch_image`, in the order of `TEST_IMAGE_URLS`.
    """
    fetched: Dict[str, Image.Image] = {}
    for image_url in TEST_IMAGE_URLS:
        fetched[image_url] = await ImageFetchAiohttp.fetch_image(image_url)
    return fetched
def get_supported_suffixes() -> Tuple[str, ...]:
    """Return the image file suffixes the fetch test is parametrized over.

    The result lists the suffixes mentioned for GPT-4 with Vision first,
    followed by the additional ones supported by us.
    """
    # We should at least test the file types mentioned in GPT-4 with Vision
    openai_suffixes = ('.png', '.jpeg', '.jpg', '.webp', '.gif')

    # Additional file types that are supported by us
    extra_suffixes = ('.bmp', '.tiff')

    return (*openai_suffixes, *extra_suffixes)
def _image_equals(a: Image.Image, b: Image.Image) -> bool:
    """Return whether two images hold identical pixel data.

    `b` is converted to the mode of `a` before the comparison, so images
    that differ only in mode can still compare equal.

    `np.array_equal` is used instead of `(x == y).all()` so that arrays of
    different shapes (images of different sizes) simply yield `False`
    rather than relying on element-wise comparison of mismatched shapes.
    """
    pixels_a = np.asarray(a)
    pixels_b = np.asarray(b.convert(a.mode))
    return np.array_equal(pixels_a, pixels_b)
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
                                  image_url: str, suffix: str):
    """Round-trip an image through a base64 data URL and fetch it back.

    The image fetched from `image_url` is saved to a temporary file with
    the given `suffix`, encoded as a `data:` URL and fetched again with
    `ImageFetchAiohttp.fetch_image`. If the saved file still equals the
    source image, the image decoded from the data URL must equal it too;
    otherwise it is only checked that the data URL can be fetched.

    The test is skipped when no MIME type is known for `suffix`, or when
    the image cannot be written as JPEG because it is in RGBA mode.
    """
    url_image = url_images[image_url]

    # Prefer the MIME type registered in PIL; fall back to `mimetypes`.
    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
    except KeyError:
        try:
            mime_type = mimetypes.types_map[suffix]
        except KeyError:
            pytest.skip('No MIME type')

    with NamedTemporaryFile(suffix=suffix) as f:
        try:
            url_image.save(f.name)
        except Exception as e:
            # Guard against exceptions without args so that the original
            # error is re-raised instead of an IndexError from this handler.
            if e.args and e.args[0] == 'cannot write mode RGBA as JPEG':
                pytest.skip('Conversion not supported')

            raise

        base64_image = base64.b64encode(f.read()).decode("utf-8")
        data_url = f"data:{mime_type};base64,{base64_image}"

        data_image = await ImageFetchAiohttp.fetch_image(data_url)
        if _image_equals(url_image, Image.open(f)):
            assert _image_equals(url_image, data_image)
        else:
            pass  # Lossy format; only check that image can be opened
tests/quantization/test_bitsandbytes.py
0 → 100644
View file @
f48954a4
'''Tests whether bitsandbytes computation is enabled correctly.
Run `pytest tests/quantization/test_bitsandbytes.py`.
'''
import
pytest
import
torch
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
# Device compute capability encoded as a single int: major * 10 + minor.
# Querying the capability without a CUDA device raises at import time and
# breaks test collection, so fall back to -1 in that case; the `skipif`
# comparison against the minimum capability below then skips the test.
if torch.cuda.is_available():
    capability = torch.cuda.get_device_capability()
    capability = capability[0] * 10 + capability[1]
else:
    capability = -1
@pytest.mark.skipif(
    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
    reason='bitsandbytes is not supported on this GPU type.')
def test_load_bnb_model(vllm_runner) -> None:
    """Load a model with bitsandbytes quantization and check the result.

    Verifies that the MLP and self-attention projection weights of the
    first layer are stored as `torch.uint8`, that the lm_head, embedding
    and layernorm weights are not, and that greedy generation reproduces
    the expected first line for two prompts.
    """
    with vllm_runner('huggyllama/llama-7b',
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     enforce_eager=True) as llm:

        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501

        # check the weights in MLP & SelfAttention are quantized to torch.uint8
        qweight = model.model.layers[0].mlp.gate_up_proj.qweight
        assert qweight.dtype == torch.uint8, (
            f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')

        qweight = model.model.layers[0].mlp.down_proj.qweight
        assert qweight.dtype == torch.uint8, (
            f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')

        qweight = model.model.layers[0].self_attn.o_proj.qweight
        assert qweight.dtype == torch.uint8, (
            f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')

        qweight = model.model.layers[0].self_attn.qkv_proj.qweight
        assert qweight.dtype == torch.uint8, (
            f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')

        # some weights should not be quantized
        weight = model.lm_head.weight
        assert weight.dtype != torch.uint8, (
            'lm_head weight dtype should not be torch.uint8')

        weight = model.model.embed_tokens.weight
        assert weight.dtype != torch.uint8, (
            'embed_tokens weight dtype should not be torch.uint8')

        weight = model.model.layers[0].input_layernorm.weight
        assert weight.dtype != torch.uint8, (
            'input_layernorm weight dtype should not be torch.uint8')

        # The message names the layer actually being checked (it previously
        # repeated the input_layernorm message).
        weight = model.model.layers[0].post_attention_layernorm.weight
        assert weight.dtype != torch.uint8, (
            'post_attention_layernorm weight dtype should not be torch.uint8')

        # check the output of the model is expected
        sampling_params = SamplingParams(temperature=0.0,
                                         logprobs=1,
                                         prompt_logprobs=1,
                                         max_tokens=8)

        prompts = ['That which does not kill us', 'To be or not to be,']
        expected_outputs = [
            'That which does not kill us makes us stronger.',
            'To be or not to be, that is the question.'
        ]
        outputs = llm.generate(prompts, sampling_params=sampling_params)

        assert len(outputs) == len(prompts)

        for output, expected in zip(outputs, expected_outputs):
            # compare the first line of the output
            actual_output = output[1][0].split('\n', 1)[0]
            expected_output = expected.split('\n', 1)[0]
            assert actual_output == expected_output, (
                f'Expected: {expected_output}, but got: {actual_output}')
tests/quantization/test_compressed_tensors.py
View file @
f48954a4
...
@@ -5,32 +5,58 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
...
@@ -5,32 +5,58 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
import
torch
import
torch
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsLinearMethod
,
CompressedTensorsW8A8StaticTensor
)
CompressedTensorsLinearMethod
,
CompressedTensorsW8A8DynamicToken
,
CompressedTensorsW8A8StaticTensor
)
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
):
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
):
model_path
=
"nm-testing/tinyllama-one-shot-static-quant-test-compressed"
model_path
=
"nm-testing/tinyllama-oneshot-w8a8-static-v2"
llm
=
vllm_runner
(
model_path
,
quantization
=
"sparseml"
,
enforce_eager
=
True
)
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
layer
=
model
.
model
.
layers
[
0
]
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
qkv_proj
=
layer
.
self_attn
.
qkv_proj
o_proj
=
layer
.
self_attn
.
o_proj
o_proj
=
layer
.
self_attn
.
o_proj
gate_up_proj
=
layer
.
mlp
.
gate_up_proj
gate_up_proj
=
layer
.
mlp
.
gate_up_proj
down_proj
=
layer
.
mlp
.
down_proj
down_proj
=
layer
.
mlp
.
down_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
o_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
o_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
gate_up_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
gate_up_proj
.
quant_method
,
assert
isinstance
(
down_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
CompressedTensorsLinearMethod
)
assert
isinstance
(
down_proj
.
quant_method
,
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8StaticTensor
)
CompressedTensorsLinearMethod
)
assert
qkv_proj
.
weight
.
dtype
is
torch
.
int8
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8StaticTensor
)
assert
o_proj
.
weight
.
dtype
is
torch
.
int8
assert
gate_up_proj
.
weight
.
dtype
is
torch
.
int8
assert
qkv_proj
.
weight
.
dtype
is
torch
.
int8
assert
o_proj
.
weight
.
dtype
is
torch
.
int8
assert
qkv_proj
.
weight_scale
.
shard_splitter
is
not
None
assert
gate_up_proj
.
weight
.
dtype
is
torch
.
int8
assert
qkv_proj
.
weight_scale
.
logical_widths
is
not
None
assert
qkv_proj
.
input_scale
.
dtype
is
torch
.
float32
assert
qkv_proj
.
weight_scale
.
shard_splitter
is
not
None
assert
qkv_proj
.
weight_scale
.
logical_widths
is
not
None
assert
qkv_proj
.
input_scale
.
dtype
is
torch
.
float32
def test_compressed_tensors_no_enforce_eager(vllm_runner):
    """Generate with the static W8A8 model without passing enforce_eager.

    The test only checks that generation with default sampling parameters
    returns a truthy result.
    """
    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
    with vllm_runner(model_path) as llm:
        default_params = SamplingParams()
        result = llm.generate("Hello world!", sampling_params=default_params)
        assert result
def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
    """Check the setup of the dynamic per-token W8A8 test model.

    The qkv projection of the first layer must use
    `CompressedTensorsLinearMethod` with the
    `CompressedTensorsW8A8DynamicToken` scheme and hold int8 weights.
    """
    model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
    with vllm_runner(model_path, enforce_eager=True,
                     dtype=torch.float16) as llm:
        # Walk down from the runner to the loaded model.
        driver_worker = llm.model.llm_engine.model_executor.driver_worker
        model = driver_worker.model_runner.model
        first_layer = model.model.layers[0]

        qkv_proj = first_layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
        assert qkv_proj.weight.dtype is torch.int8
tests/quantization/test_fp8.py
View file @
f48954a4
...
@@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
...
@@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
(),
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
(),
reason
=
"FP8 is not supported on this GPU type."
)
reason
=
"FP8 is not supported on this GPU type."
)
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
llm
=
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
Prev
1
…
3
4
5
6
7
8
9
10
11
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment