Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e661d594
Commit
e661d594
authored
Aug 12, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.4' into v0.5.4-dtk24.04.1
parents
6b16ea2e
4db5176d
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1221 additions
and
142 deletions
+1221
-142
tests/lora/utils.py
tests/lora/utils.py
+148
-0
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+91
-0
tests/models/test_blip2.py
tests/models/test_blip2.py
+102
-0
tests/models/test_compressed_tensors.py
tests/models/test_compressed_tensors.py
+0
-52
tests/models/test_danube3_4b.py
tests/models/test_danube3_4b.py
+52
-0
tests/models/test_fuyu.py
tests/models/test_fuyu.py
+4
-4
tests/models/test_internvl.py
tests/models/test_internvl.py
+201
-0
tests/models/test_llava_next.py
tests/models/test_llava_next.py
+129
-45
tests/models/test_minicpmv.py
tests/models/test_minicpmv.py
+283
-0
tests/models/test_phi3v.py
tests/models/test_phi3v.py
+4
-4
tests/prefix_caching/test_prefix_caching.py
tests/prefix_caching/test_prefix_caching.py
+56
-0
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+14
-4
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+1
-1
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+12
-3
tests/samplers/test_logprobs.py
tests/samplers/test_logprobs.py
+36
-2
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+8
-15
tests/samplers/test_sampler.py
tests/samplers/test_sampler.py
+4
-1
tests/spec_decode/e2e/conftest.py
tests/spec_decode/e2e/conftest.py
+2
-1
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
+56
-4
tests/spec_decode/e2e/test_seed.py
tests/spec_decode/e2e/test_seed.py
+18
-6
No files found.
tests/lora/utils.py
View file @
e661d594
...
...
@@ -86,3 +86,151 @@ class DummyLoRAManager:
packed_lora
=
PackedLoRALayerWeights
.
pack
(
base_loras
)
self
.
set_module_lora
(
module_name
,
packed_lora
)
return
packed_lora
def assert_close(a, b):
    """Assert that ``a`` and ``b`` agree within a dtype-dependent tolerance.

    The tolerance pair ``(rtol, atol)`` is looked up from the dtype of ``a``:
    ``6e-2`` for ``torch.float16`` and ``torch.bfloat16``, ``1e-2`` for
    ``torch.float32``. Any other dtype raises ``KeyError``. The comparison
    itself is delegated to ``torch.testing.assert_close``.
    """
    tolerance_by_dtype = {
        torch.float16: (6e-2, 6e-2),
        torch.bfloat16: (6e-2, 6e-2),
        torch.float32: (1e-2, 1e-2),
    }
    rtol, atol = tolerance_by_dtype[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
def ref_torch_groupgemm(
    out_tensor,
    inputs,
    lora_weights,
    lora_indices_tensor,
    seq_len_tensor,
    batches,
    scaling,
    op_type,
) -> None:
    """Reference grouped GEMM computed with plain PyTorch ops.

    ``inputs`` is consumed as consecutive row segments, one per batch, whose
    lengths are taken from ``seq_len_tensor``. For batch ``i`` the weight
    ``lora_weights[lora_indices_tensor[i]]`` is applied with
    ``torch.nn.functional.linear`` and the product is multiplied by
    ``scaling``. The per-batch results are concatenated along dim 0.

    The result is written into ``out_tensor`` in place: accumulated
    (``+=``) when ``op_type == "expand"``, copied over otherwise.

    Returns:
        None. The output is delivered through ``out_tensor`` only (the
        previous ``-> torch.Tensor`` annotation did not match the code).
    """
    out_list = []
    current_offset = 0
    # zip() stops at the shorter of `batches` and len(seq_len_tensor).
    for lora_index, b_length in zip(range(batches), seq_len_tensor):
        segment = inputs[current_offset:b_length + current_offset, :]
        current_offset += b_length
        weight = lora_weights[lora_indices_tensor[lora_index]]
        result = torch.nn.functional.linear(segment, weight)
        result *= scaling
        out_list.append(result)
    cat_result = torch.cat(out_list, dim=0)
    if op_type == "expand":
        # expand accumulates on top of whatever the output already holds
        out_tensor += cat_result
    else:
        out_tensor.copy_(cat_result)
    return
def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
                  op_type, device):
    """Build random inputs, weights and output buffers for a LoRA kernel test.

    Every batch gets exactly ``seq_length`` tokens (``randint`` is called
    with the half-open range ``[seq_length, seq_length + 1)``).

    For ``op_type == "shrink"`` the inputs have ``hidden_size`` columns, the
    weights have shape ``(lora_nums, max_rank, hidden_size)`` and both output
    buffers start at zero (the "our" buffer is float32). For any other
    ``op_type`` the inputs have ``max_rank`` columns, the weights have shape
    ``(lora_nums, hidden_size, max_rank)`` and the outputs start from the
    same random values.

    Returns:
        Tuple of ``(inputs_tensor, lora_weights, our_out_tensor,
        ref_out_tensor, b_seq_start_loc, lora_indices_tensor,
        seq_len_tensor, indices)``.
    """
    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                   (batches, )).to(device)
    # Start offset of each batch: exclusive prefix sum of the lengths.
    start_offsets = torch.tensor([0] + seq_len_tensor[:-1].tolist(),
                                 dtype=torch.long)
    b_seq_start_loc = torch.cumsum(start_offsets, dim=0).to(device)
    total_tokens = seq_len_tensor.sum()

    if op_type == "shrink":
        inputs_tensor = torch.rand((total_tokens, hidden_size),
                                   dtype=dtype).to(device)
        # col-major
        weight_shape = (lora_nums, max_rank, hidden_size)
        lora_weights = torch.rand(weight_shape, dtype=dtype).to(device)
        # shrink op needs atomic_add, so the output is initialized to 0
        ref_out_tensor = torch.zeros((total_tokens, max_rank),
                                     dtype=dtype,
                                     device=inputs_tensor.device)
        # NOTE: the shrink kernel uses torch.float32 as its output type
        our_out_tensor = torch.zeros((total_tokens, max_rank),
                                     dtype=torch.float32).to(device)
    else:
        inputs_tensor = torch.rand((total_tokens, max_rank),
                                   dtype=dtype).to(device)
        # col-major
        weight_shape = (lora_nums, hidden_size, max_rank)
        lora_weights = torch.rand(weight_shape, dtype=dtype).to(device)
        # expand op needs to complete y += a @ lora_b, so the output is
        # initialized randomly
        ref_out_tensor = torch.rand((total_tokens, hidden_size),
                                    dtype=dtype).to(device)
        # Both implementations must start from the same values.
        our_out_tensor = ref_out_tensor.clone()

    # NOTE(review): randint's upper bound is exclusive, so for lora_nums > 1
    # the last LoRA index (lora_nums - 1) is never drawn -- confirm intent.
    index_upper = lora_nums - 1 if lora_nums > 1 else 1
    lora_indices_tensor = torch.randint(0, index_upper,
                                        (batches, )).to(device)

    # Per-token LoRA index: repeat each batch's index over its tokens.
    indices = torch.zeros(total_tokens, dtype=torch.long).to(device)
    offset = 0
    for lora_index, length in zip(lora_indices_tensor,
                                  seq_len_tensor.tolist()):
        indices[offset:offset + length].copy_(lora_index)
        offset += length

    return (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    )
def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
                                     seq_length, dtype, nslices, device):
    """Build random test data for an expand op that writes ``nslices`` slices.

    Every batch gets exactly ``seq_length`` tokens. One weight tensor of
    shape ``(lora_nums, hidden_size, max_rank)`` is created per slice, and
    the output buffers have ``hidden_size * nslices`` columns.

    Returns:
        Tuple of ``(inputs_tensor, lora_weights_lst, our_out_tensor,
        ref_out_tensor, b_seq_start_loc, lora_indices_tensor,
        seq_len_tensor, indices)``.
    """
    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                   (batches, )).to(device)
    # Start offset of each batch: exclusive prefix sum of the lengths.
    start_offsets = torch.tensor([0] + seq_len_tensor[:-1].tolist(),
                                 dtype=torch.long)
    b_seq_start_loc = torch.cumsum(start_offsets, dim=0).to(device)
    total_tokens = seq_len_tensor.sum()

    inputs_tensor = torch.rand((total_tokens, max_rank),
                               dtype=dtype).to(device)
    # col-major
    weight_shape = (lora_nums, hidden_size, max_rank)
    lora_weights_lst = [
        torch.rand(weight_shape, dtype=dtype).to(device)
        for _ in range(nslices)
    ]
    # expand op needs to complete y += a @ lora_b, so the output is
    # initialized randomly
    ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
                                dtype=dtype).to(device)
    # Both implementations must start from the same values.
    our_out_tensor = ref_out_tensor.clone()

    index_upper = lora_nums - 1 if lora_nums > 1 else 1
    lora_indices_tensor = torch.randint(0, index_upper, (batches, ))

    # Per-token LoRA index: repeat each batch's index over its tokens.
    indices = torch.zeros(total_tokens, dtype=torch.long).to(device)
    offset = 0
    for lora_index, length in zip(lora_indices_tensor.tolist(),
                                  seq_len_tensor.tolist()):
        indices[offset:offset + length] = lora_index
        offset += length

    # The index tensor is moved to the target device only after the fill.
    lora_indices_tensor = lora_indices_tensor.to(device)
    return (
        inputs_tensor,
        lora_weights_lst,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    )
tests/metrics/test_metrics.py
View file @
e661d594
import
time
from
typing
import
List
import
pytest
...
...
@@ -10,6 +11,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
from
vllm.engine.metrics
import
RayPrometheusStatLogger
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
cleanup
MODELS
=
[
"facebook/opt-125m"
,
]
...
...
@@ -219,6 +222,94 @@ def test_metric_spec_decode(
"does not meet expectation"
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    log_interval: int,
) -> None:
    """Check spec-decode metrics when the logger uses a custom interval.

    The engine speculates with the target model itself (``k`` draft tokens),
    runs one request to completion, waits ``log_interval`` seconds and then
    verifies that each spec-decode metric lies in a loose expected range.
    """
    k = 5

    engine_args = EngineArgs(
        model=model,
        dtype=dtype,
        disable_log_stats=False,
        gpu_memory_utilization=0.4,
        speculative_model=model,
        num_speculative_tokens=k,
        use_v2_block_manager=True,
        enforce_eager=True,
    )
    engine = LLMEngine.from_engine_args(engine_args)

    try:
        sampling_params = SamplingParams(max_tokens=max_tokens)
        engine.add_request("request-id-0", example_prompts[0],
                           sampling_params)

        # set the log interval
        stat_logger = engine.stat_loggers['prometheus']
        stat_logger.local_interval = log_interval

        # prefill
        engine.step()

        # wait for 5 seconds to ensure that spec decode metrics
        # get triggered in the first decode step
        time.sleep(5)

        # first decode step should trigger async collection of metrics
        engine.step()

        # wait one second to allow H2D transfer to finish
        time.sleep(1)

        # second decode step should now be able to collect the spec
        # decode stats, and the request should also be finished
        engine.step()

        # must have finished by now
        assert not engine.has_unfinished_requests()

        # wait to ensure logging occurs
        time.sleep(log_interval)

        # force logging
        engine.step()

        # The purpose of this test is to verify that spec decode metrics
        # are reported, not functional correctness, so the expected ranges
        # (inclusive lower/upper bounds) are intentionally loose.
        expected_bounds = {
            "gauge_spec_decode_draft_acceptance_rate": (0, 1),
            "gauge_spec_decode_efficiency": (0, 1),
            "counter_spec_decode_num_accepted_tokens": (0, k),
            "counter_spec_decode_num_draft_tokens": (k, k),
            "counter_spec_decode_num_emitted_tokens": (0, k + 1),
        }

        for metric_name, (lower, upper) in expected_bounds.items():
            metric = getattr(stat_logger.metrics, metric_name)
            metric_val = metric.labels(**stat_logger.labels)._value.get()
            assert lower <= metric_val <= upper, (
                f"the value of metric {metric_name} ({metric_val}) "
                "does not meet expectation")
    finally:
        # Drop the engine and run the shared cleanup even on failure.
        del engine
        cleanup()
def
assert_metrics
(
engine
:
LLMEngine
,
disable_log_stats
:
bool
,
num_requests
:
int
)
->
None
:
if
disable_log_stats
:
...
...
tests/models/test_blip2.py
0 → 100644
View file @
e661d594
from
typing
import
List
,
Optional
,
Tuple
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
IMAGE_ASSETS
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"Question: What's the content of the image? Answer:"
,
"cherry_blossom"
:
"Question: What is the season? Answer:"
,
})
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Convert a vLLM output triple into the form the HF output is compared in.

    A newline is appended to the vLLM text, the text is re-encoded with the
    tokenizer loaded for ``model``, and the leading BOS token is dropped
    (its presence is asserted). The logprobs are passed through unchanged.
    """
    _, vllm_text, out_logprobs = vllm_output

    hf_output_str = vllm_text + "\n"

    tokenizer = AutoTokenizer.from_pretrained(model)
    encoded = tokenizer.encode(hf_output_str)
    assert encoded[0] == tokenizer.bos_token_id

    return encoded[1:], hf_output_str, out_logprobs
@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"])
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Inference results should match between HF and vLLM.

    Each image asset is paired with its prompt and rescaled once per entry
    of ``size_factors``. vLLM is run first, then HF; the vLLM outputs are
    sanitized with ``vllm_to_hf_output`` before the logprob comparison.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    inputs_per_image = []
    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        scaled_images = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, scaled_images))

    # max_model_len should be greater than image_feature_size
    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
        vllm_outputs_per_image = []
        for prompts, scaled in inputs_per_image:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=scaled))

    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
        hf_outputs_per_image = []
        for prompts, scaled in inputs_per_image:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=scaled))

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                        vllm_outputs_per_image):
        sanitized = [
            vllm_to_hf_output(vllm_output, model)
            for vllm_output in vllm_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=sanitized,
            name_0="hf",
            name_1="vllm",
        )
tests/models/test_compressed_tensors.py
deleted
100644 → 0
View file @
6b16ea2e
"""Compares vllm vs sparseml for compressed-tensors
Note: vllm and sparseml do not have bitwise correctness,
so in this test, we just confirm that the top selected
tokens of each are in the top 5 selections of each other.
"""
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
.utils
import
check_logprobs_close
MODELS
=
[
# No bias
"nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
,
# Bias
"neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
]
MAX_TOKENS
=
32
NUM_LOGPROBS
=
5
@pytest.mark.skipif(
    not is_quant_method_supported("compressed-tensors"),
    reason="compressed-tensors is not supported on this machine type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(
    vllm_runner,
    hf_runner,
    example_prompts,
    model_name,
) -> None:
    """Compare greedy logprobs from the sparseml (HF) runner and vLLM."""
    # Reference outputs from sparseml.
    sparseml_ctx = hf_runner(model_name=model_name, is_sparseml_model=True)
    with sparseml_ctx as sparseml_model:
        sparseml_outputs = sparseml_model.generate_greedy_logprobs_limit(
            example_prompts, MAX_TOKENS, NUM_LOGPROBS)

    # Outputs from vLLM.
    with vllm_runner(model_name=model_name) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, MAX_TOKENS, NUM_LOGPROBS)

    check_logprobs_close(
        outputs_0_lst=sparseml_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="sparseml",
        name_1="vllm",
    )
tests/models/test_danube3_4b.py
0 → 100644
View file @
e661d594
"""Compare the outputs of HF and vLLM when using greedy sampling.
This tests danube3 separately because its head size isn't supported on CPU yet.
Run `pytest tests/models/test_danube3_4b.py`.
"""
import
pytest
from
.utils
import
check_outputs_equal
MODELS
=
[
"h2oai/h2o-danube3-4b-base"
]
target_dtype
=
"half"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Greedy generations from HF and vLLM must be equal for the same prompts."""
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    comparison = dict(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
    check_outputs_equal(**comparison)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Printing the loaded model must not raise.

    This verifies that the model's ``extra_repr`` can be rendered.
    """
    with vllm_runner(model, dtype=dtype) as vllm_model:
        # Walk down from the runner to the underlying model module.
        executor = vllm_model.model.llm_engine.model_executor
        loaded_model = executor.driver_worker.model_runner.model
        print(loaded_model)
tests/models/test_fuyu.py
View file @
e661d594
...
...
@@ -77,8 +77,8 @@ def run_test(
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
vllm_
images
)
for
prompts
,
vllm_
images
in
inputs_per_image
images
=
images
)
for
prompts
,
images
in
inputs_per_image
]
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
...
...
@@ -89,9 +89,9 @@ def run_test(
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
hf_
images
,
images
=
images
,
eos_token_id
=
eos_token_id
)
for
prompts
,
hf_
images
in
inputs_per_image
for
prompts
,
images
in
inputs_per_image
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_image
,
...
...
tests/models/test_internvl.py
0 → 100644
View file @
e661d594
import
types
from
typing
import
List
,
Optional
,
Type
import
pytest
import
torch
from
huggingface_hub
import
snapshot_download
from
PIL.Image
import
Image
from
vllm.model_executor.models.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values
)
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.utils
import
is_cpu
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<|im_start|>User
\n
<image>
\n
What's the content in the center of the image?<|im_end|>
\n
<|im_start|>Assistant
\n
"
,
# noqa: E501
"cherry_blossom"
:
"<|im_start|>User
\n
<image>
\n
What is the season?<|im_end|>
\n
<|im_start|>Assistant
\n
"
,
# noqa: E501
})
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
models
=
[
snapshot_download
(
"OpenGVLab/InternVL2-1B"
),
snapshot_download
(
"OpenGVLab/InternVL2-2B"
),
# snapshot_download("OpenGVLab/InternVL2-4B"), # broken
]
class InternVLProcessor:
    """A simple processor for InternVL2 HF model which misses a processor."""

    def __init__(self, hf_runner: HfRunner):
        """Capture the tokenizer, model dtype and image-token count."""
        hf_model = hf_runner.model
        self.num_image_token = hf_model.num_image_token
        self.tokenizer = hf_runner.tokenizer
        self.dtype = hf_model.dtype

    def __call__(self, text: str, images: Image, **kwargs):
        """Tokenize ``text`` and attach the pixel values of ``images``.

        The first ``'<image>'`` placeholder in ``text`` is replaced with
        ``IMG_START``, a run of ``IMG_CONTEXT`` tokens and ``IMG_END``. The
        run length is ``num_image_token`` times the size of the first
        dimension of the pixel values. Extra keyword arguments are accepted
        and ignored.
        """
        pixel_values = image_to_pixel_values(images).to(self.dtype)

        # Only one image is handled, so there is a single patch count.
        num_patches = pixel_values.shape[0]
        context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
        image_tokens = IMG_START + context_tokens + IMG_END
        text = text.replace('<image>', image_tokens, 1)

        prompt = self.tokenizer(text, return_tensors="pt")
        prompt.update({"pixel_values": pixel_values})
        return prompt
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
def generate(
    self,
    pixel_values: torch.FloatTensor,
    input_ids: torch.FloatTensor,
    attention_mask: Optional[torch.LongTensor] = None,
    **generate_kwargs,
) -> torch.LongTensor:
    """Generate method for InternVL2 model without fixed use_cache.

    The token embeddings of ``input_ids`` are computed, the rows whose token
    id equals ``self.img_context_token_id`` are overwritten with the vision
    features from ``self.extract_feature(pixel_values)``, and the result is
    passed to ``self.language_model.generate`` as ``inputs_embeds`` together
    with ``attention_mask`` and any extra keyword arguments.
    """
    assert self.img_context_token_id is not None

    vit_embeds = self.extract_feature(pixel_values)
    embed_layer = self.language_model.get_input_embeddings()
    input_embeds = embed_layer(input_ids)

    batch, seq_len, hidden = input_embeds.shape
    num_positions = batch * seq_len

    # Flatten batch and sequence so one boolean mask addresses every token.
    input_embeds = input_embeds.reshape(num_positions, hidden)
    input_ids = input_ids.reshape(num_positions)

    selected = (input_ids == self.img_context_token_id)
    assert selected.sum() != 0

    vision_rows = vit_embeds.reshape(-1, hidden)
    input_embeds[selected] = vision_rows.to(input_embeds.device)

    input_embeds = input_embeds.reshape(batch, seq_len, hidden)

    return self.language_model.generate(
        inputs_embeds=input_embeds,
        attention_mask=attention_mask,
        **generate_kwargs,
    )
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference results should match between HF and vLLM.

    Each image asset is paired with its prompt and rescaled once per entry
    of ``size_factors``. vLLM is run first; the HF model is then patched
    (image-context token id, a processor, ``get_output_embeddings`` and
    ``generate``) before it is run, and the logprobs of the two runs are
    compared.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    inputs_per_image = []
    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        scaled_images = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, scaled_images))

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     max_model_len=4096,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs_per_image = []
        for prompts, scaled in inputs_per_image:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=scaled))

    with hf_runner(model, dtype=dtype) as hf_model:
        img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
            "<IMG_CONTEXT>")
        hf_model.model.img_context_token_id = img_context_token_id
        hf_model.processor = InternVLProcessor(hf_model)
        # Forward output-embedding lookups to the wrapped language model.
        hf_model.model.get_output_embeddings = lambda: \
            hf_model.model.language_model.get_output_embeddings()
        # Bind the module-level generate() as the model's generate method.
        hf_model.model.generate = types.MethodType(generate, hf_model.model)
        eos_token_id = hf_model.tokenizer.eos_token_id

        hf_outputs_per_image = []
        for prompts, hf_images in inputs_per_image:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=hf_images,
                    eos_token_id=eos_token_id))

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                        vllm_outputs_per_image):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
# "bfloat16" is used when is_cpu() reports a CPU backend, "half" otherwise.
target_dtype = "bfloat16" if is_cpu() else "half"
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Run the HF-vs-vLLM comparison with a tensor-parallel size of 1."""
    options = dict(
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
    run_test(hf_runner, vllm_runner, image_assets, model, **options)
tests/models/test_llava_next.py
View file @
e661d594
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
import
pytest
from
transformers
import
AutoConfig
,
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm.model_executor.models.llava_next
import
(
get_llava_next_image_feature_size
)
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
IMAGE_ASSETS
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
...
...
@@ -27,6 +25,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
IMAGE_TOKEN_ID
=
32000
models
=
[
"llava-hf/llava-v1.6-vicuna-7b-hf"
]
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]],
...
...
@@ -50,45 +50,75 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
return
hf_output_ids
,
hf_output_str
,
out_logprobs
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"llava-hf/llava-v1.6-vicuna-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# No image
[],
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
@
overload
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
image_assets
:
_ImageAssets
,
model
:
str
,
*
,
size_factors
:
List
[
float
],
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
...
@
overload
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
image_assets
:
_ImageAssets
,
model
:
str
,
*
,
sizes
:
List
[
Tuple
[
int
,
int
]],
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
...
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
image_assets
:
_ImageAssets
,
model
:
str
,
*
,
size_factors
:
Optional
[
List
[
float
]]
=
None
,
sizes
:
Optional
[
List
[
Tuple
[
int
,
int
]]]
=
None
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
if
size_factors
is
not
None
:
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
elif
sizes
is
not
None
:
inputs_per_image
=
[(
[
prompt
for
_
in
sizes
],
[
image
.
resize
(
size
)
for
size
in
sizes
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
else
:
raise
ValueError
(
"You must provide either `size_factors` or `sizes`"
)
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_model_len
=
4096
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs_per_image
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
...
...
@@ -122,11 +152,65 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
)
@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
                                                         (183, 488, 776)])
def test_image_feature_size(height_and_width_and_result):
    """The computed image feature size must equal the expected value.

    Each parameter is a ``(height, width, expected_feature_size)`` triple,
    evaluated against the ``llava-hf/llava-v1.6-mistral-7b-hf`` config.
    """
    height, width, result = height_and_width_and_result
    config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
    feature_size = get_llava_next_image_feature_size(config,
                                                     input_height=height,
                                                     input_width=width)
    assert feature_size == result
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype, max_tokens, num_logprobs) -> None:
    """Inference results should match between HF and vLLM.

    All the image fixtures for the test are under tests/images.
    For the huggingface runner, we provide the PIL images as input.
    For the vllm runner, we provide MultiModalDataDict objects
    and the corresponding vision language config as input.
    Note that the text input is also adjusted to abide by the vllm contract.
    The text output is sanitized so it can be compared with hf.
    """
    options = dict(
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
    run_test(hf_runner, vllm_runner, image_assets, model, **options)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "sizes",
    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes,
                            dtype, max_tokens, num_logprobs) -> None:
    """Run the HF-vs-vLLM comparison with explicit image sizes.

    ``sizes`` is forwarded to ``run_test`` in place of ``size_factors``.
    """
    options = dict(
        sizes=sizes,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
    run_test(hf_runner, vllm_runner, image_assets, model, **options)
tests/models/test_minicpmv.py
0 → 100644
View file @
e661d594
from
collections
import
UserDict
from
typing
import
List
,
Optional
,
Tuple
,
Type
import
pytest
import
torch
import
torch.types
from
transformers
import
BatchFeature
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
class NestedInputs(UserDict):
    """Mapping that stores model inputs under the single key "model_inputs".

    The wrapped object is also kept as the ``model_inputs`` attribute so
    that ``to`` can move it to another device.
    """

    def __init__(self, model_inputs: BatchFeature):
        """Store ``model_inputs`` both as a dict entry and as an attribute."""
        payload = {"model_inputs": model_inputs}
        super().__init__(payload)

        self.model_inputs = model_inputs

    def to(self, device: torch.types.Device):
        """Return a new ``NestedInputs`` whose payload was moved to ``device``."""
        moved_inputs = self.model_inputs.to(device)
        return NestedInputs(moved_inputs)
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>
\n\n
"
\
"(<image>./</image>)
\n
What's the content of the image?<|eot_id|>"
\
"<|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
# noqa: E501
"cherry_blossom"
:
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>
\n\n
"
\
"(<image>./</image>)
\n
What is the season?<|eot_id|>"
\
"<|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
})
models
=
[
"openbmb/MiniCPM-Llama3-V-2_5"
]
def trunc_hf_output(hf_output: Tuple[List[int], str,
                                     Optional[SampleLogprobs]]):
    """Cut the HF output text at its first "<|eot_id|>" marker.

    The text is only modified when it ends with the marker; in that case
    everything from the first occurrence of the marker onwards is dropped.
    Token ids and logprobs are returned unchanged.
    """
    output_ids, output_str, out_logprobs = hf_output

    eot_marker = "<|eot_id|>"
    if not output_str.endswith(eot_marker):
        return output_ids, output_str, out_logprobs

    truncated, _, _ = output_str.partition(eot_marker)
    return output_ids, truncated, out_logprobs
target_dtype
=
"half"
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference results should match between HF and vLLM.

    Each image asset is paired with its prompt and rescaled once per entry
    of ``size_factors``. vLLM is run first with the tokenizer's
    ``eos_id``/``eot_id`` as stop tokens. HF is then run under
    ``torch.no_grad()`` with its processor output wrapped in
    ``NestedInputs``. The HF outputs are truncated with ``trunc_hf_output``
    before the logprob comparison.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    inputs_per_image = []
    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        scaled_images = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, scaled_images))

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     max_model_len=4096,
                     max_num_seqs=1,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

        vllm_outputs_per_image = []
        for prompts, scaled in inputs_per_image:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=scaled,
                    stop_token_ids=stop_token_ids))

    with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
        hf_processor = hf_model.processor

        def wrapped_processor(**kw):
            # Wrap the original processor's output in NestedInputs.
            return NestedInputs(hf_processor(**kw))  # type: ignore

        hf_model.processor = wrapped_processor

        hf_outputs_per_image = []
        for prompts, scaled in inputs_per_image:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=scaled,
                    tokenizer=tokenizer))

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                        vllm_outputs_per_image):
        truncated_hf = [trunc_hf_output(hf_output) for hf_output in hf_outputs]
        check_logprobs_close(
            outputs_0_lst=truncated_hf,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Run the single-image comparison for one parameter combination.

    Every parametrized value is forwarded unchanged to ``run_test``; the
    tensor-parallel size is fixed to 1 here.
    """
    settings = {
        "size_factors": size_factors,
        "dtype": dtype,
        "max_tokens": max_tokens,
        "num_logprobs": num_logprobs,
        "tensor_parallel_size": 1,
    }
    run_test(hf_runner, vllm_runner, image_assets, model, **settings)
# Chat-formatted prompt for the multi-image test: a user turn holding two
# ``(<image>./</image>)`` placeholders followed by the instruction, then the
# opening header of the assistant turn.
HF_MULTIIMAGE_IMAGE_PROMPT = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "(<image>./</image>)\n(<image>./</image>)\n"
    "Describe these images.<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
def run_multi_image_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Compare greedy logprob outputs of vLLM and HF on a multi-image prompt.

    A single test case is built: the multi-image prompt is repeated once per
    entry of ``size_factors``, and each repetition is paired with the full
    list of asset images rescaled by that factor. The case is generated with
    the vLLM runner first and then with the HF runner, and the two result
    sets are checked with ``check_logprobs_close``.

    Args:
        hf_runner: Context-manager factory for the HF model wrapper.
        vllm_runner: Context-manager factory for the vLLM model wrapper.
        image_assets: Iterable of assets exposing ``pil_image``.
        model: Model identifier handed to both runners.
        size_factors: Scale factors applied to every image.
        dtype: dtype handed to both runners.
        max_tokens: Generation length passed to both generate calls.
        num_logprobs: Number of logprobs requested from both runners.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    source_images = [asset.pil_image for asset in image_assets]

    case_prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    case_images = []
    for factor in size_factors:
        case_images.append(
            [rescale_image_size(image, factor) for image in source_images])
    inputs_per_case = [(case_prompts, case_images)]

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
            model,
            max_model_len=4096,
            max_num_seqs=1,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

        vllm_outputs_per_case = []
        for batch_prompts, batch_images in inputs_per_case:
            vllm_outputs_per_case.append(
                vllm_model.generate_greedy_logprobs(
                    batch_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=batch_images,
                    stop_token_ids=stop_token_ids,
                ))

    with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
        hf_processor = hf_model.processor

        def _nested_processor(**kw):
            # Wrap the processor's result in NestedInputs before it is used.
            return NestedInputs(
                hf_processor(**kw)  # type: ignore
            )

        hf_model.processor = _nested_processor

        hf_outputs_per_case = []
        for batch_prompts, batch_images in inputs_per_case:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    batch_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=batch_images,
                    tokenizer=tokenizer,
                ))

    paired_outputs = zip(hf_outputs_per_case, vllm_outputs_per_case)
    for hf_outputs, vllm_outputs in paired_outputs:
        truncated_hf_outputs = [
            trunc_hf_output(hf_output) for hf_output in hf_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=truncated_hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_tokens: int,
                             num_logprobs: int) -> None:
    """Run the multi-image comparison for one parameter combination.

    Every parametrized value is forwarded unchanged to
    ``run_multi_image_test``; the tensor-parallel size is fixed to 1 here.
    """
    settings = {
        "size_factors": size_factors,
        "dtype": dtype,
        "max_tokens": max_tokens,
        "num_logprobs": num_logprobs,
        "tensor_parallel_size": 1,
    }
    run_multi_image_test(hf_runner, vllm_runner, image_assets, model,
                         **settings)
tests/models/test_phi3v.py
View file @
e661d594
...
...
@@ -101,8 +101,8 @@ def run_test(
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
vllm_
images
)
for
prompts
,
vllm_
images
in
inputs_per_image
images
=
images
)
for
prompts
,
images
in
inputs_per_image
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
...
...
@@ -114,9 +114,9 @@ def run_test(
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
hf_
images
,
images
=
images
,
eos_token_id
=
eos_token_id
)
for
prompts
,
hf_
images
in
inputs_per_image
for
prompts
,
images
in
inputs_per_image
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_image
,
...
...
tests/prefix_caching/test_prefix_caching.py
View file @
e661d594
...
...
@@ -6,10 +6,17 @@ from typing import List
import
pytest
from
tests.kernels.utils
import
override_backend_env_variable
from
vllm.block
import
PhysicalTokenBlock
from
vllm.core.block_manager_v1
import
CachedBlockAllocator
from
vllm.utils
import
Device
from
..models.utils
import
check_outputs_equal
MODELS
=
[
"facebook/opt-125m"
,
]
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
16
])
...
...
@@ -76,3 +83,52 @@ def test_eviction(num_blocks: int, ):
assert
(
realloc_block
!=
new_block
)
assert
(
new_block
.
block_hash
==
new_block_hash
)
assert
(
new_block
.
block_number
==
2
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
def test_mixed_requests(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    backend: str,
    dtype: str,
    max_tokens: int,
    cached_position: int,
    use_v2_block_manager: bool,
    monkeypatch,
) -> None:
    """
    Check a prefill batch in which only some sequences hit the prefix cache.

    One prompt, chosen by ``cached_position``, is generated on its own
    first so that it is already cached when the whole prompt list is
    generated. The outputs of that second run must equal the HF reference.
    """
    override_backend_env_variable(monkeypatch, backend)

    # Reference outputs from the HF runner.
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    cached_prompt = example_prompts[cached_position]
    vllm_kwargs = {
        "dtype": dtype,
        "enable_prefix_caching": True,
        "use_v2_block_manager": use_v2_block_manager,
    }
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        # Run the first prompt so the cache is populated; its output is
        # not needed for the comparison.
        vllm_model.generate_greedy([cached_prompt], max_tokens)

        # Run all the prompts
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/quantization/test_bitsandbytes.py
View file @
e661d594
...
...
@@ -8,15 +8,20 @@ import torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
SamplingParams
models_to_test
=
[
(
'huggyllama/llama-7b'
,
'quantize model inflight'
),
(
'lllyasviel/omost-llama-3-8b-4bits'
,
'read pre-quantized model'
),
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
def
test_load_bnb_model
(
vllm_runner
)
->
None
:
with
vllm_runner
(
'huggyllama/llama-7b'
,
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_to_test
)
def
test_load_bnb_model
(
vllm_runner
,
model_name
,
description
)
->
None
:
with
vllm_runner
(
model_name
,
quantization
=
'bitsandbytes'
,
load_format
=
'bitsandbytes'
,
enforce_eager
=
True
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
# check the weights in MLP & SelfAttention are quantized to torch.uint8
...
...
@@ -65,12 +70,17 @@ def test_load_bnb_model(vllm_runner) -> None:
'To be or not to be, that is the question.'
]
outputs
=
llm
.
generate
(
prompts
,
sampling_params
=
sampling_params
)
assert
len
(
outputs
)
==
len
(
prompts
)
for
index
in
range
(
len
(
outputs
)):
# compare the first line of the output
actual_output
=
outputs
[
index
][
1
][
0
].
split
(
'
\n
'
,
1
)[
0
]
expected_output
=
expected_outputs
[
index
].
split
(
'
\n
'
,
1
)[
0
]
assert
len
(
actual_output
)
>=
len
(
expected_output
),
(
f
'Actual
{
actual_output
}
should be larger than or equal to '
f
'expected
{
expected_output
}
'
)
actual_output
=
actual_output
[:
len
(
expected_output
)]
assert
actual_output
==
expected_output
,
(
f
'Expected:
{
expected_output
}
, but got:
{
actual_output
}
'
)
tests/quantization/test_compressed_tensors.py
View file @
e661d594
"""Test model set-up and weight loading for
sparseml
-quantized models.
"""Test model set-up and weight loading for
llmcompressor
-quantized models.
Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
...
...
tests/quantization/test_fp8.py
View file @
e661d594
...
...
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
MODELS
=
[
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
,
"nm-testing/Phi-3-mini-128k-instruct-FP8"
,
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV"
,
]
...
...
@@ -59,12 +60,20 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
as
llm
:
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
def
test_load_fp16_model
(
vllm_runner
,
kv_cache_dtype
:
str
)
->
None
:
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
,
kv_cache_dtype
=
kv_cache_dtype
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
if
kv_cache_dtype
==
"fp8"
:
attn
=
model
.
model
.
decoder
.
layers
[
0
].
self_attn
.
attn
assert
isinstance
(
attn
.
quant_method
,
Fp8KVCacheMethod
)
assert
attn
.
_k_scale
==
1.0
assert
attn
.
_v_scale
==
1.0
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
...
...
@@ -114,7 +123,7 @@ def test_scaled_fp8_quant(dtype) -> None:
assert
torch
.
allclose
(
ref_y
,
per_tensor_dequantize
(
y
,
inv_scale
,
dtype
))
# Padding
y
,
_
=
ops
.
scaled_fp8_quant
(
x
,
inv_scale
,
batch_dim
_padding
=
17
)
y
,
_
=
ops
.
scaled_fp8_quant
(
x
,
inv_scale
,
num_token
_padding
=
17
)
assert
y
.
shape
[
0
]
==
17
assert
torch
.
allclose
(
ref_y
,
...
...
tests/samplers/test_logprobs.py
View file @
e661d594
...
...
@@ -14,7 +14,7 @@ MODELS = ["facebook/opt-125m"]
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
# needed for comparing logprobs with HF
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
1
,
4
,
16
,
-
1
])
@
pytest
.
mark
.
parametrize
(
"num_top_logprobs"
,
[
6
])
# 32000 == vocab_size
@
pytest
.
mark
.
parametrize
(
"num_top_logprobs"
,
[
0
,
6
])
# 32000 == vocab_size
@
pytest
.
mark
.
parametrize
(
"detokenize"
,
[
True
,
False
])
def
test_get_prompt_logprobs
(
hf_runner
,
...
...
@@ -63,7 +63,10 @@ def test_get_prompt_logprobs(
assert
result
.
outputs
[
0
].
logprobs
is
not
None
assert
len
(
result
.
outputs
[
0
].
logprobs
)
==
max_tokens
for
logprobs
in
result
.
outputs
[
0
].
logprobs
:
assert
len
(
logprobs
)
==
num_top_logprobs
# If the output token is not included in the top X
# logprob, it can return 1 more data
assert
(
len
(
logprobs
)
==
num_top_logprobs
or
len
(
logprobs
)
==
num_top_logprobs
+
1
)
output_text
=
result
.
outputs
[
0
].
text
output_string_from_most_likely_tokens_lst
:
List
[
str
]
=
[]
for
top_logprobs
in
result
.
outputs
[
0
].
logprobs
:
...
...
@@ -136,3 +139,34 @@ def test_max_logprobs():
with
pytest
.
raises
(
ValueError
):
runner
.
generate
([
"Hello world"
],
sampling_params
=
bad_sampling_params
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
@pytest.mark.parametrize("detokenize", [True, False])
def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int,
                       detokenize: bool, example_prompts):
    """Check that ``logprobs=None`` produces no logprob data in the outputs.

    Args:
        vllm_runner: Context-manager factory for the vLLM model wrapper.
        model: Model identifier handed to the runner.
        chunked_prefill_token_size: ``-1`` leaves chunked prefill off; any
            other value turns it on, is used as ``max_num_batched_tokens``
            and caps ``max_num_seqs``.
        detokenize: Forwarded to ``SamplingParams``.
        example_prompts: Prompts to generate from.
    """
    max_num_seqs = 256
    enable_chunked_prefill = False
    max_num_batched_tokens = None
    if chunked_prefill_token_size != -1:
        enable_chunked_prefill = True
        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
        max_num_batched_tokens = chunked_prefill_token_size
    max_tokens = 5

    with vllm_runner(
            model,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            max_num_seqs=max_num_seqs,
    ) as vllm_model:
        sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens,
                                                       logprobs=None,
                                                       temperature=0.0,
                                                       detokenize=detokenize)
        results_logprobs_none = vllm_model.model.generate(
            example_prompts, sampling_params=sampling_params_logprobs_none)

    # Iterate the results directly instead of indexing through range(len()).
    for result in results_logprobs_none:
        first_output = result.outputs[0]
        assert first_output.logprobs is None
        assert first_output.cumulative_logprob is None
tests/samplers/test_rejection_sampler.py
View file @
e661d594
...
...
@@ -150,10 +150,9 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
high
=
vocab_size
,
size
=
(
batch_size
,
k
),
dtype
=
torch
.
int64
)
generators
=
[
None
]
*
batch_size
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
,
generators
)
draft_token_ids
)
@
pytest
.
mark
.
parametrize
(
"frac_seeded"
,
[
0.0
,
0.25
,
0.5
,
1.0
])
...
...
@@ -185,14 +184,13 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
results
=
[]
for
_
in
range
(
n_rep
):
generators
=
[
torch
.
Generator
(
device
=
device
).
manual_seed
(
i
)
if
seeded_mask
[
i
]
else
None
for
i
in
range
(
batch_size
)
]
seeded_seqs
=
{
i
:
torch
.
Generator
(
device
=
device
).
manual_seed
(
i
)
for
i
in
range
(
batch_size
)
if
seeded_mask
[
i
]
}
results
.
append
(
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
,
generator
s
))
draft_token_ids
,
seeded_seq
s
))
for
i
in
range
(
batch_size
):
if
seeded_mask
[
i
]:
...
...
@@ -242,11 +240,10 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
raise
AssertionError
()
oob_token_ids
[
0
][
0
]
=
rogue_token_id
generators
=
[
None
]
*
batch_size
with
pytest
.
raises
(
AssertionError
):
rejection_sampler
(
target_probs
,
bonus_token_ids
,
draft_probs
,
draft_token_ids
,
generators
)
draft_token_ids
)
@
pytest
.
mark
.
parametrize
(
"draft_and_target_probs_equal"
,
[
True
,
False
])
...
...
@@ -417,15 +414,11 @@ class _CorrectnessTestHelper:
dtype
=
torch
.
int64
,
device
=
"cuda"
).
repeat
(
num_samples
,
1
)
# unseeded
generators
=
[
None
]
# Get output tokens via rejection sampling.
output_token_ids
=
self
.
rejection_sampler
(
target_probs
.
to
(
"cuda"
),
bonus_token_ids
.
to
(
"cuda"
),
draft_probs
.
to
(
"cuda"
),
draft_token_ids
.
to
(
"cuda"
),
generators
)
draft_token_ids
.
to
(
"cuda"
))
# Remove bonus tokens
output_token_ids
=
output_token_ids
[:,
:
-
1
].
flatten
()
...
...
tests/samplers/test_sampler.py
View file @
e661d594
...
...
@@ -510,13 +510,16 @@ def test_sampler_mixed(seed: int, device: str):
))
seq_lens
.
append
(
seq_group_metadata_list
[
-
1
].
seq_data
[
0
].
get_len
())
generators
:
Dict
[
str
,
torch
.
Generator
]
=
{}
def
test_sampling
():
sampling_metadata
=
SamplingMetadata
.
prepare
(
seq_group_metadata_list
,
seq_lens
,
query_lens
=
seq_lens
,
device
=
device
,
pin_memory
=
is_pin_memory_available
())
pin_memory
=
is_pin_memory_available
(),
generators
=
generators
)
sampler_output
=
sampler
(
logits
=
fake_logits
,
sampling_metadata
=
sampling_metadata
)
...
...
tests/spec_decode/e2e/conftest.py
View file @
e661d594
...
...
@@ -191,7 +191,8 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
and
llm
.
llm_engine
.
log_stats
):
for
sate_logger
in
llm
.
llm_engine
.
stat_loggers
.
values
():
sate_logger
.
local_interval
=
0
set_random_seed
(
seed
)
if
seed
is
not
None
:
set_random_seed
(
seed
)
yield
llm
del
llm
...
...
tests/spec_decode/e2e/test_mlp_correctness.py
View file @
e661d594
...
...
@@ -21,17 +21,18 @@ correctess for the target model outputs.
import
pytest
from
.conftest
import
run_greedy_equality_correctness_test
from
.conftest
import
(
run_equality_correctness_test
,
run_greedy_equality_correctness_test
)
# main model
MAIN_MODEL
=
"
ibm-granite/granite-3b-code-instruct
"
MAIN_MODEL
=
"
JackFram/llama-160m
"
# speculative model
SPEC_MODEL
=
"ibm-
granite/granite-3b-code-instruct
-accelerator"
SPEC_MODEL
=
"ibm-
fms/llama-160m
-accelerator"
# max. number of speculative tokens: this corresponds to
# n_predict in the config.json of the speculator model.
MAX_SPEC_TOKENS
=
5
MAX_SPEC_TOKENS
=
3
# precision
PRECISION
=
"float32"
...
...
@@ -77,6 +78,57 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
force_output_len
=
True
)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Required for spec decode.
        "use_v2_block_manager": True,

        # Print spec metrics.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model": MAIN_MODEL,

        # Speculative model
        "speculative_model": SPEC_MODEL,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
@pytest.mark.parametrize("output_len", [64])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("temperature", [0.1, 1.0])
@pytest.mark.parametrize("seed", [None])
def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
                                    batch_size: int, output_len: int,
                                    temperature: float):
    """Verify seeded runs produce the same output."""

    def _compare_outputs(seeded: bool) -> None:
        # Both comparisons use identical settings except for `seeded`.
        run_equality_correctness_test(baseline_llm_generator,
                                      test_llm_generator,
                                      batch_size,
                                      max_output_len=output_len,
                                      temperature=temperature,
                                      seeded=seeded,
                                      force_output_len=True)

    # The baseline and test generators must agree when `seeded` is True.
    _compare_outputs(True)

    # Ensure this same test does fail if we _don't_ include per-request seeds
    with pytest.raises(AssertionError):
        _compare_outputs(False)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
...
...
tests/spec_decode/e2e/test_seed.py
View file @
e661d594
...
...
@@ -21,24 +21,36 @@ from .conftest import run_equality_correctness_test
"num_speculative_tokens"
:
3
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"seed"
:
1
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"seed"
:
5
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
8
,
32
])
@
pytest
.
mark
.
parametrize
(
"temperature"
,
[
0.1
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
# Use smaller output len for fast test.
1
0
,
2
0
,
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_seeded_consistency
(
baseline_llm_generator
,
batch_size
:
int
,
temperature
:
float
,
output_len
:
int
):
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
None
])
def
test_seeded_consistency
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
:
int
,
temperature
:
float
,
output_len
:
int
):
"""Verify outputs are consistent across multiple runs with same seed
"""
run_equality_correctness_test
(
baseline_llm_generator
,
baseline
_llm_generator
,
test
_llm_generator
,
batch_size
,
max_output_len
=
output_len
,
temperature
=
temperature
,
seeded
=
True
,
force_output_len
=
True
)
# Ensure this same test does fail if we _don't_ include per-request seeds
with
pytest
.
raises
(
AssertionError
):
run_equality_correctness_test
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
max_output_len
=
output_len
,
temperature
=
temperature
,
seeded
=
False
,
force_output_len
=
True
)
Prev
1
…
5
6
7
8
9
10
11
12
13
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment