Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0640f227
Commit
0640f227
authored
Sep 09, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.0' into v0.6.0-dev
parents
82f1ffdf
32e7db25
Changes
335
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1705 additions
and
339 deletions
+1705
-339
tests/kernels/test_causal_conv1d.py
tests/kernels/test_causal_conv1d.py
+205
-0
tests/kernels/test_flashinfer.py
tests/kernels/test_flashinfer.py
+222
-6
tests/kernels/test_mamba_ssm.py
tests/kernels/test_mamba_ssm.py
+324
-0
tests/lora/test_gemma.py
tests/lora/test_gemma.py
+7
-2
tests/lora/test_quant_model.py
tests/lora/test_quant_model.py
+18
-6
tests/models/test_fp8.py
tests/models/test_fp8.py
+81
-100
tests/models/test_granite.py
tests/models/test_granite.py
+49
-0
tests/models/test_intern_vit.py
tests/models/test_intern_vit.py
+1
-2
tests/models/test_internvl.py
tests/models/test_internvl.py
+34
-41
tests/models/test_llava.py
tests/models/test_llava.py
+17
-0
tests/models/test_llava_next.py
tests/models/test_llava_next.py
+80
-13
tests/models/test_minicpmv.py
tests/models/test_minicpmv.py
+34
-102
tests/models/test_mistral.py
tests/models/test_mistral.py
+3
-1
tests/models/test_phi3v.py
tests/models/test_phi3v.py
+88
-21
tests/models/test_phimoe.py
tests/models/test_phimoe.py
+111
-0
tests/models/test_ultravox.py
tests/models/test_ultravox.py
+79
-28
tests/models/utils.py
tests/models/utils.py
+26
-17
tests/multi_step/test_correctness_async_llm.py
tests/multi_step/test_correctness_async_llm.py
+129
-0
tests/multi_step/test_correctness_llm.py
tests/multi_step/test_correctness_llm.py
+102
-0
tests/multimodal/test_base.py
tests/multimodal/test_base.py
+95
-0
No files found.
tests/kernels/test_causal_conv1d.py
0 → 100644
View file @
0640f227
from
typing
import
Optional
import
pytest
import
torch
import
torch.nn.functional
as
F
from
einops
import
rearrange
from
vllm.model_executor.layers.mamba.ops.causal_conv1d
import
(
causal_conv1d_fn
,
causal_conv1d_update
)
def causal_conv1d_ref(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    initial_states: Optional[torch.Tensor] = None,
    return_final_states: bool = False,
    final_states_out: Optional[torch.Tensor] = None,
    activation: Optional[str] = "silu",
):
    """Pure-PyTorch reference for the depthwise causal 1-D convolution.

    Shapes:
        x: (batch, dim, seqlen)
        weight: (dim, width)
        bias: (dim,)
        initial_states: (batch, dim, width - 1)
        final_states_out: (batch, dim, width - 1)
        out: (batch, dim, seqlen)

    Returns:
        ``(out, None)`` when ``return_final_states`` is false, otherwise
        ``(out, final_states_out)``. If a ``final_states_out`` buffer was
        passed in, it is filled in place and returned.

    Raises:
        NotImplementedError: if ``activation`` is not None, "silu" or "swish".
    """
    if activation not in (None, "silu", "swish"):
        raise NotImplementedError("activation must be None, silu, or swish")

    input_dtype = x.dtype
    seqlen = x.shape[-1]
    dim, width = weight.shape
    x = x.to(weight.dtype)

    if initial_states is not None:
        # The carried-in state plays the role of the left padding.
        x = torch.cat([initial_states, x], dim=-1)
        conv_padding = 0
    else:
        conv_padding = width - 1
    # groups=dim makes the convolution depthwise (one filter per channel).
    out = F.conv1d(x,
                   weight.unsqueeze(1),
                   bias,
                   padding=conv_padding,
                   groups=dim)
    # conv1d pads both sides; keep only the first `seqlen` (causal) outputs.
    out = out[..., :seqlen]

    if return_final_states:
        # A negative left pad trims x down to its last (width - 1) columns;
        # a positive one zero-fills when x is shorter than that.
        tail = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
            input_dtype)  # (batch, dim, width - 1)
        if final_states_out is None:
            final_states_out = tail
        else:
            final_states_out.copy_(tail)

    if activation is not None:
        out = F.silu(out)
    out = out.to(dtype=input_dtype)

    if return_final_states:
        return out, final_states_out
    return out, None
def causal_conv1d_update_ref(x: torch.Tensor,
                             conv_state: torch.Tensor,
                             weight: torch.Tensor,
                             bias: Optional[torch.Tensor] = None,
                             activation: Optional[str] = None):
    """Pure-PyTorch reference for one decode step of the causal conv.

    Shapes:
        x: (batch, dim)
        conv_state: (batch, dim, width) -- updated in place
        weight: (dim, width)
        bias: (dim,)
        out: (batch, dim)

    Raises:
        NotImplementedError: if ``activation`` is not None, "silu" or "swish".
    """
    if activation not in (None, "silu", "swish"):
        raise NotImplementedError("activation must be None, silu, or swish")

    input_dtype = x.dtype
    batch, dim = x.shape
    width = weight.shape[1]
    assert conv_state.shape == (batch, dim, width)
    assert weight.shape == (dim, width)

    # Slide the window one step left and put the new sample in the last slot,
    # then write the result back into the caller's buffer. (B D W)
    window = torch.roll(conv_state, shifts=-1, dims=-1)
    window[:, :, -1] = x
    conv_state.copy_(window)

    out = (conv_state * weight).sum(dim=-1)  # (B D)
    if bias is not None:
        # In-place add keeps `out`'s dtype, exactly like `out += bias`.
        out.add_(bias)
    if activation is not None:
        out = F.silu(out)
    return out.to(dtype=input_dtype)
@pytest.mark.parametrize("return_final_states", [False, True])
@pytest.mark.parametrize("has_initial_states", [False, True])
@pytest.mark.parametrize("channel_last", [False, True])
@pytest.mark.parametrize("itype", [torch.bfloat16])
@pytest.mark.parametrize("silu_activation", [False, True])
@pytest.mark.parametrize("has_bias", [False, True])
@pytest.mark.parametrize("width", [4])
@pytest.mark.parametrize("seqlen", [128, 512, 4096])
@pytest.mark.parametrize('dim', [64, 4096 + 32])
@pytest.mark.parametrize('batch', [1, 2])
def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
                       itype, channel_last, has_initial_states,
                       return_final_states):
    """Check `causal_conv1d_fn` against the PyTorch reference.

    Compares the output (and, when requested, the final conv states) of
    `causal_conv1d_fn` with `causal_conv1d_ref` on random CUDA inputs.
    """
    if not channel_last and (has_initial_states or return_final_states):
        pytest.skip(
            "Only channel_last support initial_states or return_final_states")

    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
        rtol, atol = 1e-2, 5e-2
    # set seed
    torch.random.manual_seed(0)

    # x is sliced out of a wider random tensor, so it is a strided view
    # rather than a freshly allocated contiguous buffer.
    if not channel_last:
        x = torch.randn(batch,
                        4096 + dim + 64,
                        seqlen,
                        device=device,
                        dtype=itype)[:, 4096:4096 + dim, :]
    else:
        x = rearrange(
            torch.randn(batch,
                        seqlen,
                        4096 + dim + 64,
                        device=device,
                        dtype=itype)[:, :, 4096:4096 + dim],
            "b s d -> b d s")
    weight = torch.randn(dim, width, device=device, dtype=itype)
    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
    if has_initial_states:
        initial_states = torch.randn(batch,
                                     width - 1,
                                     dim,
                                     device=device,
                                     dtype=itype).transpose(1, 2)
    else:
        initial_states = None

    # Independent copies so the op under test cannot affect the reference
    # inputs.
    x_ref = x.detach().clone()
    weight_ref = weight.detach().clone()
    bias_ref = bias.detach().clone() if bias is not None else None
    initial_states_ref = (initial_states.detach().clone()
                          if initial_states is not None else None)

    activation = None if not silu_activation else "silu"
    out, final_states = causal_conv1d_fn(
        x,
        weight,
        bias,
        initial_states=initial_states,
        return_final_states=return_final_states,
        activation=activation)
    out_ref, final_states_ref = causal_conv1d_ref(
        x_ref,
        weight_ref,
        bias_ref,
        initial_states=initial_states_ref,
        return_final_states=return_final_states,
        activation=activation)

    if return_final_states:
        assert final_states is not None and final_states_ref is not None
        assert torch.allclose(final_states,
                              final_states_ref,
                              rtol=rtol,
                              atol=atol)

    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
    # The former trailing `out += F.sigmoid(final_states)...` statements ran
    # after the last assertion and had no effect on the test, so they are
    # gone.
@pytest.mark.parametrize("itype", [torch.bfloat16])
@pytest.mark.parametrize("silu_activation", [False, True])
@pytest.mark.parametrize("has_bias", [False, True])
@pytest.mark.parametrize("width", [2, 3, 4])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
@pytest.mark.parametrize("batch", [1, 2])
def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
                              itype):
    """Check `causal_conv1d_update` against the PyTorch reference.

    Both implementations update their own copy of the conv state in place.
    The states must match exactly and the outputs within tolerance.
    """
    device = "cuda"
    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
        rtol, atol = 1e-2, 5e-2
    # set seed
    torch.random.manual_seed(0)

    # NOTE: `batch` used to be overwritten with 2 here, which silently
    # turned the parametrized batch=1 case into a duplicate of batch=2.
    x = torch.randn(batch, dim, device=device, dtype=itype)
    conv_state = torch.randn(batch, dim, width, device=device, dtype=itype)
    # No gradients are taken in this test, so the tensors are created
    # without requires_grad.
    weight = torch.randn(dim, width, device=device, dtype=itype)
    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None

    conv_state_ref = conv_state.detach().clone()
    activation = None if not silu_activation else "silu"

    out = causal_conv1d_update(x,
                               conv_state,
                               weight,
                               bias,
                               activation=activation)
    out_ref = causal_conv1d_update_ref(x,
                                       conv_state_ref,
                                       weight,
                                       bias,
                                       activation=activation)

    assert torch.equal(conv_state, conv_state_ref)
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
tests/kernels/test_flashinfer.py
View file @
0640f227
...
...
@@ -73,11 +73,14 @@ def ref_paged_attn(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"soft_cap"
,
[
None
,
30.0
,
50.0
])
@
torch
.
inference_mode
def
test_flashinfer_decode_with_paged_kv
(
kv_lens
:
List
[
int
],
num_heads
:
Tuple
[
int
,
int
],
head_size
:
int
,
dtype
:
torch
.
dtype
,
block_size
:
int
,
soft_cap
:
Optional
[
float
])
->
None
:
def
test_flashinfer_decode_with_paged_kv
(
kv_lens
:
List
[
int
],
num_heads
:
Tuple
[
int
,
int
],
head_size
:
int
,
dtype
:
torch
.
dtype
,
block_size
:
int
,
soft_cap
:
Optional
[
float
],
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
num_seqs
=
len
(
kv_lens
)
...
...
@@ -88,6 +91,7 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
scale
=
head_size
**-
0.5
query
=
torch
.
randn
(
num_seqs
,
num_query_heads
,
head_size
,
dtype
=
dtype
)
key_value_cache
=
torch
.
randn
(
NUM_BLOCKS
,
2
,
block_size
,
...
...
@@ -125,7 +129,7 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
wrapper
=
flashinfer
.
\
BatchDecodeWithPagedKVCacheWrapper
(
workspace_buffer
,
"NHD"
,
use_tensor_cores
=
(
(
num_query_heads
//
num_kv_heads
)
not
in
(
1
,
2
,
4
,
8
)
)
(
num_query_heads
//
num_kv_heads
)
>
4
)
)
wrapper
.
begin_forward
(
kv_indptr
,
kv_indices
,
...
...
@@ -249,3 +253,215 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
soft_cap
=
soft_cap
)
torch
.
testing
.
assert_close
(
output
,
ref_output
,
atol
=
1e-2
,
rtol
=
1e-2
),
\
f
"
{
torch
.
max
(
torch
.
abs
(
output
-
ref_output
))
}
"
@pytest.mark.parametrize("seq_lens", [[(1, 132), (5, 18)]])
@pytest.mark.parametrize("num_heads", [(32, 8), (6, 1)])
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
def test_flashinfer_prefill_with_paged_fp8_kv(
        seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int],
        head_size: int, dtype: torch.dtype, block_size: int,
        soft_cap: Optional[float]) -> None:
    """Prefill attention over an FP8 paged KV cache vs. the reference.

    The KV cache is generated in `dtype`, scaled and cast to
    `torch.float8_e4m3fn` for FlashInfer, while `ref_paged_attn` runs on the
    unquantized cache. Each `seq_lens` entry is `(query_len, kv_len)`.
    """
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
    num_query_heads = num_heads[0]
    num_kv_heads = num_heads[1]
    assert num_query_heads % num_kv_heads == 0
    max_kv_len = max(kv_lens)
    scale = head_size**-0.5

    kv_cache_dtype = torch.float8_e4m3fn

    query = torch.randn(sum(query_lens),
                        num_query_heads,
                        head_size,
                        dtype=dtype)
    NUM_BLOCKS_FP8 = 2048
    key_value_cache = torch.randn(NUM_BLOCKS_FP8,
                                  2,
                                  block_size,
                                  num_kv_heads,
                                  head_size,
                                  dtype=dtype)
    # chunk() returns views, so the in-place divisions below also rescale
    # key_value_cache itself.
    key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
    key_cache /= head_size**0.5
    value_cache /= head_size**0.5

    # Per-tensor scales mapping the largest value onto 448.0
    # (presumably the float8_e4m3fn maximum -- confirm).
    k_scale = key_cache.amax().item() / 448.0
    v_scale = value_cache.amax().item() / 448.0

    kv_cache_fp8 = torch.cat([key_cache / k_scale, value_cache / v_scale],
                             dim=1).to(kv_cache_dtype)

    assert (kv_cache_fp8.shape == key_value_cache.shape)
    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 NUM_BLOCKS_FP8,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

    # Build the index arrays passed to begin_forward: cumulative query
    # lengths, cumulative page counts, flattened page ids and the number of
    # valid tokens in each sequence's last page.
    qo_indptr = [0]
    kv_indptr = [0]
    kv_indices = []
    kv_last_page_lens = []
    for i, (query_len, seq_len) in enumerate(zip(query_lens, kv_lens)):
        assert seq_len > 0
        num_blocks = (seq_len + block_size - 1) // block_size
        kv_indices.extend(block_tables[i, :num_blocks])
        kv_indptr.append(kv_indptr[-1] + num_blocks)
        kv_last_page_len = seq_len % block_size
        if kv_last_page_len == 0:
            kv_last_page_len = block_size
        kv_last_page_lens.append(kv_last_page_len)
        qo_indptr.append(qo_indptr[-1] + query_len)

    qo_indptr = torch.tensor(qo_indptr, dtype=torch.int32)
    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)

    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
        workspace_buffer, "NHD")
    wrapper.begin_forward(
        qo_indptr,
        kv_indptr,
        kv_indices,
        kv_last_page_lens,
        num_query_heads,
        num_kv_heads,
        head_size,
        block_size,
    )

    output = wrapper.forward(query,
                             kv_cache_fp8,
                             logits_soft_cap=soft_cap,
                             k_scale=k_scale,
                             v_scale=v_scale)

    ref_output = ref_paged_attn(query=query,
                                key_cache=key_cache.squeeze(1),
                                value_cache=value_cache.squeeze(1),
                                query_lens=query_lens,
                                kv_lens=kv_lens,
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
    del query
    del block_tables
    # verify prefill fp8
    # assert_close raises with its own mismatch report; the old trailing
    # `, f"..."` only built a tuple that was thrown away.
    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
@pytest.mark.parametrize("num_heads", [(32, 8), (64, 8), (6, 1)])
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
@torch.inference_mode
def test_flashinfer_decode_with_paged_fp8_kv(
    kv_lens: List[int],
    num_heads: Tuple[int, int],
    head_size: int,
    dtype: torch.dtype,
    block_size: int,
    soft_cap: Optional[float],
) -> None:
    """Decode attention over an FP8 paged KV cache vs. the reference.

    One query token per sequence. FlashInfer reads the
    `torch.float8_e4m3fn` cache with per-tensor scales, while
    `ref_paged_attn` runs on the unquantized cache.
    """
    # test doesn't work for num_heads = (16,16)
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)
    num_seqs = len(kv_lens)
    num_query_heads = num_heads[0]
    num_kv_heads = num_heads[1]
    assert num_query_heads % num_kv_heads == 0
    max_kv_len = max(kv_lens)
    scale = head_size**-0.5
    use_tensor_cores = (num_query_heads // num_kv_heads) > 4
    kv_cache_dtype = torch.float8_e4m3fn

    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
    NUM_BLOCKS_FP8 = 2048
    key_value_cache = torch.randn(NUM_BLOCKS_FP8,
                                  2,
                                  block_size,
                                  num_kv_heads,
                                  head_size,
                                  dtype=dtype)
    # chunk() returns views, so the in-place divisions below also rescale
    # key_value_cache itself (the reference reads it further down).
    key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
    key_cache /= head_size**0.5
    value_cache /= head_size**0.5

    # Per-tensor scales mapping the largest value onto 448.0
    # (presumably the float8_e4m3fn maximum -- confirm).
    k_scale = key_cache.amax().item() / 448.0
    v_scale = value_cache.amax().item() / 448.0

    key_cache_fp8 = (key_cache / k_scale).to(kv_cache_dtype)
    value_cache_fp8 = (value_cache / v_scale).to(kv_cache_dtype)
    assert (key_cache_fp8.shape[1] == 1 and value_cache_fp8.shape[1] == 1)
    kv_cache_fp8 = torch.cat([key_cache_fp8, value_cache_fp8], dim=1)

    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 NUM_BLOCKS_FP8,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

    # Build the index arrays passed to begin_forward: cumulative page
    # counts, flattened page ids and the number of valid tokens in each
    # sequence's last page.
    kv_indptr = [0]
    kv_indices = []
    kv_last_page_lens = []
    for i, seq_len in enumerate(kv_lens):
        assert seq_len > 0
        num_blocks = (seq_len + block_size - 1) // block_size
        kv_indices.extend(block_tables[i, :num_blocks])
        kv_indptr.append(kv_indptr[-1] + num_blocks)
        kv_last_page_len = seq_len % block_size
        if kv_last_page_len == 0:
            kv_last_page_len = block_size
        kv_last_page_lens.append(kv_last_page_len)

    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)

    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.\
        BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
                                           use_tensor_cores=use_tensor_cores)
    wrapper.begin_forward(kv_indptr,
                          kv_indices,
                          kv_last_page_lens,
                          num_query_heads,
                          num_kv_heads,
                          head_size,
                          block_size,
                          "NONE",
                          data_type=dtype)
    output = wrapper.forward(query,
                             kv_cache_fp8,
                             logits_soft_cap=soft_cap,
                             k_scale=k_scale,
                             v_scale=v_scale)

    # NOTE(review): after the integer index, dim 1 is block_size, so
    # squeeze(1) only has an effect when block_size == 1.
    key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
    value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)

    ref_output = ref_paged_attn(query=query,
                                key_cache=key_cache,
                                value_cache=value_cache,
                                query_lens=[1] * num_seqs,
                                kv_lens=kv_lens,
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
    # Temporary fix: Increasing the tolerance. Seems like a flashinfer issue
    # assert_close raises with its own mismatch report; the old trailing
    # `, f"..."` only built a tuple that was thrown away.
    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2)
tests/kernels/test_mamba_ssm.py
0 → 100644
View file @
0640f227
import
pytest
import
torch
import
torch.nn.functional
as
F
from
einops
import
rearrange
,
repeat
from
vllm.model_executor.layers.mamba.ops.mamba_ssm
import
(
selective_scan_fn
,
selective_state_update
)
def selective_state_update_ref(state,
                               x,
                               dt,
                               A,
                               B,
                               C,
                               D=None,
                               z=None,
                               dt_bias=None,
                               dt_softplus=False):
    """Pure-PyTorch reference for a single selective-SSM state update.

    Argument:
        state: (batch, dim, dstate) or (batch, nheads, dim, dstate);
            updated in place
        x: (batch, dim) or (batch, nheads, dim)
        dt: (batch, dim) or (batch, nheads, dim)
        A: (dim, dstate) or (nheads, dim, dstate)
        B: (batch, dstate) or (batch, ngroups, dstate)
        C: (batch, dstate) or (batch, ngroups, dstate)
        D: (dim,) or (nheads, dim)
        z: (batch, dim) or (batch, nheads, dim)
        dt_bias: (dim,) or (nheads, dim)
    Return:
        out: (batch, dim) or (batch, nheads, dim)
    """

    def _with_head_axis(t, headless_rank, axis):
        # Insert a singleton head/group axis when the head-less layout was
        # passed; None (absent optional input) passes through.
        if t is not None and t.dim() == headless_rank:
            return t.unsqueeze(axis)
        return t

    has_heads = state.dim() > 3
    # unsqueeze returns a view, so the in-place copy_ below still writes
    # into the caller's `state` tensor.
    state = _with_head_axis(state, 3, 1)
    x = _with_head_axis(x, 2, 1)
    dt = _with_head_axis(dt, 2, 1)
    A = _with_head_axis(A, 2, 0)
    B = _with_head_axis(B, 2, 1)
    C = _with_head_axis(C, 2, 1)
    D = _with_head_axis(D, 1, 0)
    z = _with_head_axis(z, 2, 1)
    dt_bias = _with_head_axis(dt_bias, 1, 0)

    batch, nheads, dim, dstate = state.shape
    assert x.shape == (batch, nheads, dim)
    assert dt.shape == x.shape
    assert A.shape == (nheads, dim, dstate)
    ngroups = B.shape[1]
    assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
    assert B.shape == (batch, ngroups, dstate)
    assert C.shape == B.shape
    if D is not None:
        assert D.shape == (nheads, dim)
    if z is not None:
        assert z.shape == x.shape
    if dt_bias is not None:
        assert dt_bias.shape == (nheads, dim)
        dt = dt + dt_bias
    if dt_softplus:
        dt = F.softplus(dt)

    heads_per_group = nheads // ngroups
    dt_col = dt.unsqueeze(-1)  # (batch, nheads, dim, 1)
    dA = torch.exp(dt_col * A)  # (batch, nheads, dim, dstate)
    # Each group is shared by `heads_per_group` consecutive heads.
    B = B.repeat_interleave(heads_per_group, dim=1)  # (batch, nheads, dstate)
    C = C.repeat_interleave(heads_per_group, dim=1)  # (batch, nheads, dstate)
    dB = dt_col * B.unsqueeze(2)  # (batch, nheads, dim, dstate)

    # (batch, nheads, dim, dstate)
    state.copy_(state * dA + dB * x.unsqueeze(-1))

    out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
    if D is not None:
        out += (x * D).to(out.dtype)
    if z is not None:
        out = out * F.silu(z)
    out = out.to(x.dtype)
    return out if has_heads else out.squeeze(1)
def selective_scan_ref(u,
                       delta,
                       A,
                       B,
                       C,
                       D=None,
                       z=None,
                       delta_bias=None,
                       delta_softplus=False,
                       return_last_state=False,
                       position_indices=None,
                       prev_state=None):
    """Sequential pure-PyTorch reference for the selective scan.

    u: r(B D L)
    delta: r(B D L)
    A: c(D N) or r(D N)
    B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
    C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
    D: r(D)
    z: r(B D L)
    delta_bias: r(D), fp32
    prev_state: r(B D N), fp32

    out: r(B D L)
    last_state (optional): r(B D dstate) or c(B D dstate)

    Returns ``out`` alone, or ``(out, last_state)`` when
    ``return_last_state`` is true. The recurrence is evaluated in float32
    and ``out`` is cast back to the dtype of ``u``.
    """
    out_dtype = u.dtype
    u = u.float()
    delta = delta.float()
    if delta_bias is not None:
        delta = delta + delta_bias[..., None].float()
    if delta_softplus:
        delta = F.softplus(delta)

    batch, seqlen = u.shape[0], u.shape[2]
    dim, dstate = A.shape[0], A.shape[1]
    # Rank selects the layout: 2 -> constant (D N), 3 -> (B N L),
    # otherwise grouped (B G N L).
    b_rank, c_rank = B.dim(), C.dim()
    B = B.float()
    C = C.float()

    if prev_state is None:
        hidden = A.new_zeros((batch, dim, dstate))
    else:
        hidden = prev_state

    deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
    if b_rank < 3:
        deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u)
    elif b_rank == 3:
        deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u)
    else:
        # Expand each group to the consecutive channels it serves.
        B = B.repeat_interleave(dim // B.shape[1], dim=1)
        deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
    if c_rank == 4:
        C = C.repeat_interleave(dim // C.shape[1], dim=1)

    last_state = None
    ys = []
    for t in range(seqlen):
        inject = deltaB_u[:, :, t]
        if position_indices is not None and position_indices[0, t] == 0:
            # Position 0 starts a new sequence: drop the carried state.
            hidden = inject
        else:
            hidden = deltaA[:, :, t] * hidden + inject

        if c_rank < 3:
            y_t = torch.einsum('bdn,dn->bd', hidden, C)
        elif c_rank == 3:
            y_t = torch.einsum('bdn,bn->bd', hidden, C[:, :, t])
        else:
            y_t = torch.einsum('bdn,bdn->bd', hidden, C[:, :, :, t])
        ys.append(y_t)
        last_state = hidden

    y = torch.stack(ys, dim=2)  # (batch dim L)
    out = y
    if D is not None:
        out = y + u * D.unsqueeze(-1)
    if z is not None:
        out = out * F.silu(z)
    out = out.to(dtype=out_dtype)
    if return_last_state:
        return out, last_state
    return out
@pytest.mark.parametrize('wtype', [torch.float32])
@pytest.mark.parametrize('itype', [torch.float32])
@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096])
@pytest.mark.parametrize("return_last_state", [True])
@pytest.mark.parametrize('has_delta_bias', [True])
@pytest.mark.parametrize('delta_softplus', [True])
@pytest.mark.parametrize('has_z', [True])
@pytest.mark.parametrize('has_D', [True])
@pytest.mark.parametrize("varBC_groups", [1, 2])
@pytest.mark.parametrize("is_variable_C", [True])
@pytest.mark.parametrize("is_variable_B", [True])
@pytest.mark.parametrize("scan_chunks", [1, 2, 3])
def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
                        has_z, has_delta_bias, delta_softplus,
                        return_last_state, seqlen, itype, wtype, scan_chunks):
    """Check `selective_scan_fn` against the sequential reference.

    The sequence is fed to `selective_scan_fn` in `scan_chunks` pieces, each
    chunk receiving the state returned by the previous one. The concatenated
    output and the final state must match a single `selective_scan_ref` pass
    over the whole sequence.
    """
    if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
        pytest.skip()  # This config is not applicable
    device = 'cuda'
    rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
    if itype == torch.bfloat16:
        rtol, atol = 3e-2, 5e-2
    # set seed
    torch.random.manual_seed(0)
    batch_size = 2
    dim = 4
    dstate = 8
    A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
    if not is_variable_B:
        B_shape = [dim, dstate]
    elif varBC_groups == 1:
        B_shape = [batch_size, dstate, seqlen]
    else:
        B_shape = [batch_size, varBC_groups, dstate, seqlen]
    B = torch.randn(B_shape,
                    device=device,
                    dtype=wtype if not is_variable_B else itype)
    if not is_variable_C:
        C_shape = [dim, dstate]
    elif varBC_groups == 1:
        C_shape = [batch_size, dstate, seqlen]
    else:
        C_shape = [batch_size, varBC_groups, dstate, seqlen]
    C = torch.randn(C_shape,
                    device=device,
                    dtype=wtype if not is_variable_C else itype)
    D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
    z = torch.randn(batch_size, dim, seqlen, device=device,
                    dtype=itype) if has_z else None
    delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
                  ) if has_delta_bias else None
    u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype)
    delta = (0.5 *
             torch.rand(batch_size, dim, seqlen, device=device, dtype=itype))

    state = None
    state_ref = None
    out = None
    out_ref = None
    outs = []
    # Every chunk has this length except the last, which also takes the
    # remainder of the sequence.
    chunked_prompt_len = seqlen // scan_chunks
    for c in range(scan_chunks):
        chunk_start = chunked_prompt_len * c
        chunk_end = chunked_prompt_len * (c + 1)
        if c == scan_chunks - 1:
            chunk_end = seqlen
        _B = B
        if is_variable_B:
            _B = B[..., chunk_start:chunk_end]
        _C = C
        # Slice C according to its own flag (this used to test
        # is_variable_B, which would mis-slice a constant (dim, dstate) C).
        if is_variable_C:
            _C = C[..., chunk_start:chunk_end]
        _z = z
        if has_z:
            assert z is not None
            _z = z[..., chunk_start:chunk_end]
        out, *rest = selective_scan_fn(u[..., chunk_start:chunk_end],
                                       delta[..., chunk_start:chunk_end],
                                       A,
                                       _B,
                                       _C,
                                       D,
                                       z=_z,
                                       delta_bias=delta_bias,
                                       delta_softplus=delta_softplus,
                                       return_last_state=return_last_state,
                                       prev_state=state if c > 0 else None)
        outs.append(out)
        if return_last_state:
            # Carried into the next chunk via prev_state.
            state = rest[0]
    if len(outs) > 1:
        out = torch.cat(outs, dim=-1)
    out_ref, *rest = selective_scan_ref(u,
                                        delta,
                                        A,
                                        B,
                                        C,
                                        D,
                                        z=z,
                                        delta_bias=delta_bias,
                                        delta_softplus=delta_softplus,
                                        return_last_state=return_last_state)
    if return_last_state:
        state_ref = rest[0]

    assert out is not None and out_ref is not None
    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
    if return_last_state:
        assert state is not None and state_ref is not None
        assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
@pytest.mark.parametrize("itype",
                         [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("has_z", [False, True])
@pytest.mark.parametrize("dstate", [16, 32, 64])
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
def test_selective_state_update(dim, dstate, has_z, itype):
    """Check `selective_state_update` against the PyTorch reference.

    Both implementations update their own copy of the state in place; the
    states and the returned outputs must agree within tolerance.
    """
    device = "cuda"
    if itype == torch.float32:
        rtol, atol = 3e-4, 1e-3
    elif itype == torch.bfloat16:
        rtol, atol = 1e-2, 5e-2
        # NOTE(review): the ROCm adjustment is assumed to sit inside the
        # bfloat16 branch; the flattened source does not show the nesting.
        if torch.version.hip:
            atol *= 2
    else:
        rtol, atol = 5e-3, 1e-2

    # set seed
    torch.random.manual_seed(0)
    batch_size = 1
    # Tensors are created in the same order as before, so the seeded random
    # stream yields the same values.
    state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device)
    x = torch.randn(batch_size, dim, device=device, dtype=itype)
    dt = torch.randn(batch_size, dim, device=device, dtype=itype)
    dt_bias = torch.rand(dim, device=device) - 4.0
    A = -torch.rand(dim, dstate, device=device) - 1.0
    B = torch.randn(batch_size, dstate, device=device)
    C = torch.randn(batch_size, dstate, device=device)
    D = torch.randn(dim, device=device)
    z = None
    if has_z:
        z = torch.randn_like(x)

    expected_state = state.detach().clone()
    shared_kwargs = dict(D=D, z=z, dt_bias=dt_bias, dt_softplus=True)

    actual = selective_state_update(state, x, dt, A, B, C, **shared_kwargs)
    expected = selective_state_update_ref(expected_state, x, dt, A, B, C,
                                          **shared_kwargs)

    assert torch.allclose(state, expected_state, rtol=rtol, atol=atol)
    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)
tests/lora/test_gemma.py
View file @
0640f227
from
typing
import
List
import
pytest
import
vllm
from
vllm.lora.request
import
LoRARequest
from
vllm.utils
import
is_hip
MODEL_PATH
=
"google/gemma-7b"
...
...
@@ -10,7 +13,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
prompts
=
[
"Quote: Imagination is"
,
"Quote: Be yourself;"
,
"Quote:
So many books
,"
,
"Quote:
Painting is poetry that is seen rather than felt
,"
,
]
sampling_params
=
vllm
.
SamplingParams
(
temperature
=
0
,
max_tokens
=
32
)
outputs
=
llm
.
generate
(
...
...
@@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return
generated_texts
@
pytest
.
mark
.
xfail
(
is_hip
(),
reason
=
"There can be output mismatch on ROCm"
)
def
test_gemma_lora
(
gemma_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
...
...
@@ -37,7 +41,8 @@ def test_gemma_lora(gemma_lora_files):
expected_lora_output
=
[
"more important than knowledge.
\n
Author: Albert Einstein
\n
"
,
"everyone else is already taken.
\n
Author: Oscar Wilde
\n
"
,
"so little time.
\n
Author: Frank Zappa
\n
"
,
"and poetry is painting that is felt rather than seen.
\n
"
"Author: Leonardo da Vinci
\n
"
,
]
output1
=
do_sample
(
llm
,
gemma_lora_files
,
lora_id
=
1
)
...
...
tests/lora/test_quant_model.py
View file @
0640f227
...
...
@@ -7,6 +7,7 @@ import pytest
import
vllm
from
vllm.lora.request
import
LoRARequest
from
vllm.utils
import
is_hip
from
.conftest
import
cleanup
...
...
@@ -17,12 +18,23 @@ class ModelWithQuantization:
quantization
:
str
MODELS
:
List
[
ModelWithQuantization
]
=
[
ModelWithQuantization
(
model_path
=
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"
,
quantization
=
"AWQ"
),
ModelWithQuantization
(
model_path
=
"TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
,
quantization
=
"GPTQ"
),
]
MODELS
:
List
[
ModelWithQuantization
]
#AWQ quantization is currently not supported in ROCm.
if
is_hip
():
MODELS
=
[
ModelWithQuantization
(
model_path
=
"TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
,
quantization
=
"GPTQ"
),
]
else
:
MODELS
=
[
ModelWithQuantization
(
model_path
=
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"
,
quantization
=
"AWQ"
),
ModelWithQuantization
(
model_path
=
"TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
,
quantization
=
"GPTQ"
),
]
def
do_sample
(
llm
:
vllm
.
LLM
,
...
...
tests/models/test_fp8.py
View file @
0640f227
...
...
@@ -3,116 +3,97 @@
Note: these tests will only pass on L4 GPU.
"""
import
os
from
typing
import
List
from
typing
import
Optional
import
pytest
import
torch
from
transformers
import
AutoTokenizer
from
tests.kernels.utils
import
override_backend_env_variable
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
MAX_MODEL_LEN
=
1024
MODELS
=
[
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV"
,
"meta-llama/Meta-Llama-3-8B-Instruct"
,
]
from
..models.utils
import
check_logprobs_close
EXPECTED_STRS_MAP
=
{
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV"
:
{
"auto"
:
[
'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models ('
,
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to '
,
'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both'
,
'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne'
,
'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep'
,
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The'
,
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of'
,
'Here are the translations:
\n\n
**Japanese:** (Haya aki no tori, nemuri no'
],
"fp8"
:
[
'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained'
,
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to '
,
'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.'
,
'A neural network is a complex system made up of several basic components that work together to enable it to'
,
'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like'
,
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here'
,
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of'
,
'Here are the translations:
\n\n
**Japanese:** (Haya kotori wa mushi o tsuk'
]
},
"meta-llama/Meta-Llama-3-8B-Instruct"
:
{
"auto"
:
[
'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained'
,
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to '
,
'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.'
,
'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne'
,
'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short'
,
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The'
,
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of'
,
'Here are the translations:
\n\n
**Japanese:** (Haya aki wa mushi o tsukamu'
],
"fp8"
:
[
'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained'
,
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to '
,
'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.'
,
'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne'
,
'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest'
,
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The'
,
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of'
,
'Here are the translations:
\n\n
**Japanese:** (Haya tori, mushi o tsukamu'
]
},
}
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp8 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@
pytest
.
mark
.
skip
(
reason
=
"Prevent unstable test based on golden strings from breaking the build."
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
def
test_models
(
example_prompts
,
model_name
,
kv_cache_dtype
)
->
None
:
model
=
LLM
(
model
=
model_name
,
max_model_len
=
MAX_MODEL_LEN
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
quantization
=
"fp8"
,
kv_cache_dtype
=
kv_cache_dtype
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype,base_model,test_model,scale_path"
,
[
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
(
"fp8_e4m3"
,
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV"
,
None
),
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
(
"fp8_e5m2"
,
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"meta-llama/Meta-Llama-3-8B-Instruct"
,
None
),
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
(
"fp8_e4m3"
,
"meta-llama/Llama-2-7b-chat-hf"
,
"meta-llama/Llama-2-7b-chat-hf"
,
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
)
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"XFORMERS"
,
"FLASHINFER"
])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
# Due to low-precision numerical divergence, this test is too sensitive for
# the async postprocessor
@
pytest
.
mark
.
parametrize
(
"disable_async_output_proc"
,
[
True
])
def
test_models
(
vllm_runner
,
example_prompts
,
kv_cache_dtype
:
str
,
base_model
:
str
,
test_model
:
str
,
scale_path
:
Optional
[
str
],
max_tokens
:
int
,
enforce_eager
:
bool
,
backend
:
str
,
tensor_parallel_size
:
int
,
disable_async_output_proc
:
bool
,
monkeypatch
,
)
->
None
:
"""
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
override_backend_env_variable
(
monkeypatch
,
backend
)
MAX_MODEL_LEN
=
1024
NUM_LOG_PROBS
=
8
with
vllm_runner
(
base_model
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
"auto"
,
disable_async_output_proc
=
disable_async_output_proc
,
)
as
vllm_model
:
baseline_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
formatted_prompts
=
[
tokenizer
.
apply_chat_template
([{
"role"
:
"user"
,
"content"
:
prompt
}],
tokenize
=
False
,
add_generation_prompt
=
True
)
for
prompt
in
example_prompts
]
extra_kwargs
=
{}
if
scale_path
is
not
None
:
extra_kwargs
[
"quantization_param_path"
]
=
scale_path
params
=
SamplingParams
(
max_tokens
=
20
,
temperature
=
0
)
generations
:
List
[
str
]
=
[]
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for
prompt
in
formatted_prompts
:
outputs
=
model
.
generate
(
prompt
,
params
)
generations
.
append
(
outputs
[
0
].
outputs
[
0
].
text
)
del
model
with
vllm_runner
(
test_model
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
**
extra_kwargs
,
)
as
vllm_model
:
test_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
print
(
model_name
,
kv_cache_dtype
,
generations
)
expected_strs
=
EXPECTED_STRS_MAP
[
model_name
][
kv_cache_dtype
]
for
i
in
range
(
len
(
example_prompts
)):
generated_str
=
generations
[
i
]
expected_str
=
expected_strs
[
i
]
assert
expected_str
==
generated_str
,
(
f
"Test
{
i
}
:
\n
Expected:
{
expected_str
!
r
}
\n
vLLM:
{
generated_str
!
r
}
"
)
check_logprobs_close
(
outputs_0_lst
=
baseline_outputs
,
outputs_1_lst
=
test_outputs
,
name_0
=
"fp16_kv_cache"
,
name_1
=
"fp8_kv_cache"
,
)
tests/models/test_granite.py
0 → 100644
View file @
0640f227
"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
Run `pytest tests/models/test_granite.py`.
"""
import
importlib.metadata
import
pytest
from
.utils
import
check_logprobs_close
def parse_release_version(version: str) -> tuple:
    """Return the leading numeric release components of *version* as ints.

    Parsing stops at the first dot-separated component that is not purely
    numeric, so development and pre-release builds such as ``"4.45.0.dev0"``
    or ``"4.45.0rc1"`` yield ``(4, 45, 0)`` and ``(4, 45)`` instead of raising
    ``ValueError`` the way ``int("dev0")`` would.
    """
    release = []
    for component in version.split("."):
        if not component.isdecimal():
            break
        release.append(int(component))
    return tuple(release)


try:
    _TRANSFORMERS_RAW_VERSION = importlib.metadata.version("transformers")
except importlib.metadata.PackageNotFoundError:
    # Without transformers metadata, report the lowest possible version so the
    # version-gated test is skipped instead of breaking test collection.
    _TRANSFORMERS_RAW_VERSION = "0"

# Installed transformers release as a tuple of ints, e.g. (4, 45, 0).
TRANSFORMERS_VERSION = parse_release_version(_TRANSFORMERS_RAW_VERSION)

# Model IDs the parametrized test in this module runs against.
MODELS = [
    "ibm/PowerLM-3b",
]
# GraniteForCausalLM will be in transformers >= 4.45
@pytest.mark.skipif(
    TRANSFORMERS_VERSION < (4, 45),
    reason="granite model test requires transformers >= 4.45",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy log-probs produced by the HF and vLLM runners.

    Both runners generate from the same prompts with the same token and
    log-prob limits; the two result lists are then handed to
    ``check_logprobs_close``.
    """
    # TODO(sang): Sliding window should be tested separately.
    generation_args = (example_prompts, max_tokens, num_logprobs)

    # The HF runner is used first, then the vLLM runner, each inside its own
    # context manager.
    with hf_runner(model, dtype=dtype) as reference_model:
        reference_outputs = reference_model.generate_greedy_logprobs_limit(
            *generation_args)

    with vllm_runner(model, dtype=dtype) as candidate_model:
        candidate_outputs = candidate_model.generate_greedy_logprobs(
            *generation_args)

    check_logprobs_close(
        outputs_0_lst=reference_outputs,
        outputs_1_lst=candidate_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/models/test_intern_vit.py
View file @
0640f227
...
...
@@ -6,8 +6,6 @@ import torch.nn as nn
from
huggingface_hub
import
snapshot_download
from
transformers
import
AutoConfig
,
AutoModel
,
CLIPImageProcessor
from
vllm.model_executor.models.intern_vit
import
InternVisionModel
from
..conftest
import
_ImageAssets
,
cleanup
pytestmark
=
pytest
.
mark
.
vlm
...
...
@@ -49,6 +47,7 @@ def run_intern_vit_test(
for
pixel_value
in
pixel_values
]
from
vllm.model_executor.models.intern_vit
import
InternVisionModel
vllm_model
=
InternVisionModel
(
config
)
vllm_model
.
load_weights
(
hf_model
.
state_dict
().
items
())
...
...
tests/models/test_internvl.py
View file @
0640f227
...
...
@@ -3,13 +3,9 @@ from typing import List, Optional, Tuple, Type
import
pytest
import
torch
from
huggingface_hub
import
snapshot_download
from
PIL.Image
import
Image
from
transformers
import
AutoConfig
from
vllm.model_executor.models.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values
)
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.utils
import
is_cpu
...
...
@@ -25,49 +21,15 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"<|im_start|>User
\n
<image>
\n
What is the season?<|im_end|>
\n
<|im_start|>Assistant
\n
"
,
# noqa: E501
})
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN
=
[
"*.json"
,
"*.py"
,
"*.safetensors"
,
"*.txt"
,
"*.model"
]
models
=
[
snapshot_download
(
"OpenGVLab/InternVL2-1B"
,
allow_patterns
=
DOWNLOAD_PATTERN
),
snapshot_download
(
"OpenGVLab/InternVL2-2B"
,
allow_patterns
=
DOWNLOAD_PATTERN
),
"OpenGVLab/InternVL2-1B"
,
"OpenGVLab/InternVL2-2B"
,
# Broken due to outdated implementation of Phi-3
# See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
#
snapshot_download(
"OpenGVLab/InternVL2-4B"
)
,
# "OpenGVLab/InternVL2-4B",
]
class
InternVLProcessor
:
"""A simple processor for InternVL2 HF model which misses a processor."""
def
__init__
(
self
,
hf_runner
:
HfRunner
):
self
.
num_image_token
=
hf_runner
.
model
.
num_image_token
self
.
tokenizer
=
hf_runner
.
tokenizer
self
.
dtype
=
hf_runner
.
model
.
dtype
self
.
config
=
AutoConfig
.
from_pretrained
(
hf_runner
.
model_name
)
self
.
vision_config
=
self
.
config
.
vision_config
self
.
use_thumbnail
=
self
.
config
.
use_thumbnail
self
.
min_num
=
self
.
config
.
min_dynamic_patch
self
.
max_num
=
self
.
config
.
max_dynamic_patch
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Image
,
**
kwargs
):
pixel_values
=
image_to_pixel_values
(
images
,
self
.
image_size
,
self
.
min_num
,
self
.
max_num
,
self
.
use_thumbnail
).
to
(
self
.
dtype
)
num_patches_list
=
[
pixel_values
.
shape
[
0
]]
for
num_patches
in
num_patches_list
:
context_tokens
=
IMG_CONTEXT
*
self
.
num_image_token
*
num_patches
image_tokens
=
IMG_START
+
context_tokens
+
IMG_END
text
=
text
.
replace
(
'<image>'
,
image_tokens
,
1
)
prompt
=
self
.
tokenizer
(
text
,
return_tensors
=
"pt"
)
prompt
.
update
({
"pixel_values"
:
pixel_values
})
return
prompt
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
def
generate
(
self
,
...
...
@@ -133,6 +95,37 @@ def run_test(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
class
InternVLProcessor
:
"""A simple processor for InternVL2 which misses a processor."""
def
__init__
(
self
,
hf_runner
:
HfRunner
):
self
.
num_image_token
=
hf_runner
.
model
.
num_image_token
self
.
tokenizer
=
hf_runner
.
tokenizer
self
.
dtype
=
hf_runner
.
model
.
dtype
self
.
config
=
AutoConfig
.
from_pretrained
(
hf_runner
.
model_name
)
self
.
vision_config
=
self
.
config
.
vision_config
self
.
use_thumbnail
=
self
.
config
.
use_thumbnail
self
.
min_num
=
self
.
config
.
min_dynamic_patch
self
.
max_num
=
self
.
config
.
max_dynamic_patch
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Image
,
**
kwargs
):
from
vllm.model_executor.models.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values
)
pixel_values
=
image_to_pixel_values
(
images
,
self
.
image_size
,
self
.
min_num
,
self
.
max_num
,
self
.
use_thumbnail
).
to
(
self
.
dtype
)
num_patches_list
=
[
pixel_values
.
shape
[
0
]]
for
num_patches
in
num_patches_list
:
context_tokens
=
IMG_CONTEXT
*
self
.
num_image_token
\
*
num_patches
image_tokens
=
IMG_START
+
context_tokens
+
IMG_END
text
=
text
.
replace
(
'<image>'
,
image_tokens
,
1
)
prompt
=
self
.
tokenizer
(
text
,
return_tensors
=
"pt"
)
prompt
.
update
({
"pixel_values"
:
pixel_values
})
return
prompt
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
max_model_len
=
4096
,
...
...
tests/models/test_llava.py
View file @
0640f227
...
...
@@ -179,3 +179,20 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
)
@pytest.mark.parametrize("model", models)
def test_context_length_too_short(vllm_runner, image_assets, model):
    """Expect a ValueError when the context cannot hold an image prompt.

    The runner is created with ``max_model_len=128`` and asked to generate a
    single token for the first image prompt; construction plus generation must
    raise ``ValueError`` matching "too long to fit into the model".
    """
    pil_images = [asset.pil_image for asset in image_assets]
    first_prompt = HF_IMAGE_PROMPTS[0]
    first_image = pil_images[0]

    with pytest.raises(ValueError, match="too long to fit into the model"):
        # LLaVA has a feature size of 576, larger than max_model_len below.
        runner = vllm_runner(
            model,
            max_model_len=128,
            enforce_eager=True,
        )

        with runner:
            runner.generate_greedy(
                [first_prompt],
                max_tokens=1,
                images=[first_image],
            )
tests/models/test_llava_next.py
View file @
0640f227
...
...
@@ -6,24 +6,22 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
..conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
_PREFACE
=
(
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's "
"questions."
)
_LIMIT_IMAGE_PER_PROMPT
=
4
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
f
"
{
_PREFACE
}
USER:
<image>
\n
What's the content of the image?
ASSISTANT:
"
,
"[INST]
<image>
\n
What's the content of the image?
[/INST]
"
,
"cherry_blossom"
:
f
"
{
_PREFACE
}
USER:
<image>
\n
What is the season?
ASSISTANT:
"
,
"[INST]
<image>
\n
What is the season?
[/INST]
"
,
})
models
=
[
"llava-hf/llava-v1.6-
vicuna
-7b-hf"
]
models
=
[
"llava-hf/llava-v1.6-
mistral
-7b-hf"
]
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
...
...
@@ -114,19 +112,43 @@ def run_test(
else
:
raise
ValueError
(
"You must provide either `size_factors` or `sizes`"
)
_run_test
(
hf_runner
,
vllm_runner
,
inputs_per_image
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
)
def
_run_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
inputs
:
List
[
Tuple
[
List
[
str
],
PromptImageInput
]],
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_model_len
=
40
96
,
max_model_len
=
102
40
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
enforce_eager
=
True
,
limit_mm_per_prompt
=
{
"image"
:
_LIMIT_IMAGE_PER_PROMPT
})
as
vllm_model
:
vllm_outputs_per_image
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
)
for
prompts
,
images
in
inputs
_per_image
for
prompts
,
images
in
inputs
]
with
hf_runner
(
model
,
dtype
=
dtype
,
...
...
@@ -136,7 +158,7 @@ def run_test(
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
)
for
prompts
,
images
in
inputs
_per_image
for
prompts
,
images
in
inputs
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_image
,
...
...
@@ -177,7 +199,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
...
...
@@ -216,3 +238,48 @@ def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
                                      model, dtype, max_tokens,
                                      num_logprobs) -> None:
    """Run the HF/vLLM comparison on prompts carrying 1, 2 and 4 images.

    A single batch of four prompts is built, each paired with its own image
    input, and forwarded to ``_run_test`` with ``tensor_parallel_size=1``.
    """
    stop_sign = image_assets[0].pil_image
    cherry_blossom = image_assets[1].pil_image

    two_image_prompt = "[INST] <image><image>\nDescribe 2 images. [/INST]"
    four_image_prompt = (
        "[INST] <image><image><image><image>\nDescribe 4 images. [/INST]")
    one_image_prompt = "[INST] <image>\nWhat is the season? [/INST]"

    prompts = [
        two_image_prompt,
        two_image_prompt,
        four_image_prompt,
        one_image_prompt,
    ]

    # One image entry per prompt, in the same order as `prompts`.
    same_size_pair = [stop_sign, cherry_blossom]
    # Images with different sizes and aspect-ratios
    mixed_size_pair = [rescale_image_size(stop_sign, 0.1), stop_sign]
    mixed_size_quad = [
        stop_sign,
        rescale_image_size(stop_sign, 0.25),
        cherry_blossom.resize((183, 488)),
        cherry_blossom.resize((488, 183)),
    ]
    images = [
        same_size_pair,
        mixed_size_pair,
        mixed_size_quad,
        cherry_blossom,  # the last prompt gets a bare image, not a list
    ]

    inputs = [(prompts, images)]

    _run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
tests/models/test_minicpmv.py
View file @
0640f227
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
Union
import
pytest
import
torch
import
torch.types
from
PIL
import
Image
from
transformers
import
BatchEncoding
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
...
...
@@ -24,6 +25,11 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"(<image>./</image>)
\n
What is the season?<|eot_id|>"
\
"<|start_header_id|>assistant<|end_header_id|>
\n\n
"
,
})
HF_MULTIIMAGE_IMAGE_PROMPT
=
\
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>
\n\n
"
\
"(<image>./</image>)
\n
(<image>./</image>)
\n
"
\
"Describe these images.<|eot_id|>"
\
"<|start_header_id|>assistant<|end_header_id|>
\n\n
"
models
=
[
"openbmb/MiniCPM-Llama3-V-2_5"
]
...
...
@@ -46,13 +52,14 @@ target_dtype = "half"
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
image_assets
:
_ImageAssets
,
inputs
:
List
[
Tuple
[
List
[
str
],
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]]],
model
:
str
,
*
,
size_factors
:
List
[
float
],
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
mm_limit
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
...
...
@@ -65,12 +72,6 @@ def run_test(
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
...
...
@@ -82,6 +83,7 @@ def run_test(
max_model_len
=
4096
,
max_num_seqs
=
1
,
dtype
=
dtype
,
limit_mm_per_prompt
=
{
"image"
:
mm_limit
},
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
...
...
@@ -93,7 +95,7 @@ def run_test(
num_logprobs
=
num_logprobs
,
images
=
images
,
stop_token_ids
=
stop_token_ids
)
for
prompts
,
images
in
inputs
_per_image
for
prompts
,
images
in
inputs
]
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
,
postprocess_inputs
=
_wrap_inputs
)
...
...
@@ -104,7 +106,7 @@ def run_test(
num_logprobs
=
num_logprobs
,
images
=
images
,
tokenizer
=
tokenizer
)
for
prompts
,
images
in
inputs
_per_image
for
prompts
,
images
in
inputs
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_image
,
...
...
@@ -138,104 +140,26 @@ def run_test(
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
run_test
(
hf_runner
,
vllm_runner
,
i
mage_assets
,
i
nputs_per_image
,
model
,
size_factors
=
size_factors
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
1
,
tensor_parallel_size
=
1
,
)
HF_MULTIIMAGE_IMAGE_PROMPT
=
\
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>
\n\n
"
\
"(<image>./</image>)
\n
(<image>./</image>)
\n
"
\
"Describe these images.<|eot_id|>"
\
"<|start_header_id|>assistant<|end_header_id|>
\n\n
"
def
run_multi_image_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
image_assets
:
_ImageAssets
,
model
:
str
,
*
,
size_factors
:
List
[
float
],
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_case
=
[
([
HF_MULTIIMAGE_IMAGE_PROMPT
for
_
in
size_factors
],
[[
rescale_image_size
(
image
,
factor
)
for
image
in
images
]
for
factor
in
size_factors
])
]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
max_model_len
=
4096
,
max_num_seqs
=
1
,
limit_mm_per_prompt
=
{
"image"
:
len
(
images
)},
dtype
=
dtype
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
stop_token_ids
=
[
tokenizer
.
eos_id
,
tokenizer
.
eot_id
]
vllm_outputs_per_case
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
,
stop_token_ids
=
stop_token_ids
)
for
prompts
,
images
in
inputs_per_case
]
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
,
postprocess_inputs
=
_wrap_inputs
)
with
hf_model
,
torch
.
no_grad
():
hf_outputs_per_case
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
,
tokenizer
=
tokenizer
)
for
prompts
,
images
in
inputs_per_case
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
vllm_outputs_per_case
):
check_logprobs_close
(
outputs_0_lst
=
[
trunc_hf_output
(
hf_output
)
for
hf_output
in
hf_outputs
],
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
...
...
@@ -256,14 +180,22 @@ def run_multi_image_test(
def
test_multi_images_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
run_multi_image_test
(
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_case
=
[
([
HF_MULTIIMAGE_IMAGE_PROMPT
for
_
in
size_factors
],
[[
rescale_image_size
(
image
,
factor
)
for
image
in
images
]
for
factor
in
size_factors
])
]
run_test
(
hf_runner
,
vllm_runner
,
i
mage_assets
,
i
nputs_per_case
,
model
,
size_factors
=
size_factors
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
2
,
tensor_parallel_size
=
1
,
)
tests/models/test_mistral.py
View file @
0640f227
...
...
@@ -30,9 +30,11 @@ def test_models(
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
tokenizer_mode
=
"mistral"
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
...
...
tests/models/test_phi3v.py
View file @
0640f227
import
os
import
re
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
Union
import
pytest
from
PIL
import
Image
from
transformers
import
AutoTokenizer
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
,
is_hip
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
...
...
@@ -20,6 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"cherry_blossom"
:
"<|user|>
\n
<|image_1|>
\n
What is the season?<|end|>
\n
<|assistant|>
\n
"
,
})
HF_MULTIIMAGE_IMAGE_PROMPT
=
"<|user|>
\n
<|image_1|>
\n
<|image_2|>
\n
Describe these images.<|end|>
\n
<|assistant|>
\n
"
# noqa: E501
models
=
[
"microsoft/Phi-3.5-vision-instruct"
]
...
...
@@ -58,13 +60,14 @@ if is_hip():
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
image_assets
:
_ImageAssets
,
inputs
:
List
[
Tuple
[
List
[
str
],
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]]],
model
:
str
,
*
,
size_factors
:
List
[
float
],
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
mm_limit
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
...
...
@@ -77,15 +80,6 @@ def run_test(
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
,
transpose
=
idx
)
for
idx
,
factor
in
enumerate
(
size_factors
)
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
...
...
@@ -97,15 +91,16 @@ def run_test(
max_model_len
=
4096
,
max_num_seqs
=
1
,
dtype
=
dtype
,
limit_mm_per_prompt
=
{
"image"
:
mm_limit
},
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs_per_
imag
e
=
[
vllm_outputs_per_
cas
e
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
)
for
prompts
,
images
in
inputs
_per_image
for
prompts
,
images
in
inputs
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
...
...
@@ -113,17 +108,17 @@ def run_test(
with
hf_runner
(
model
,
dtype
=
dtype
,
model_kwargs
=
hf_model_kwargs
)
as
hf_model
:
eos_token_id
=
hf_model
.
processor
.
tokenizer
.
eos_token_id
hf_outputs_per_
imag
e
=
[
hf_outputs_per_
cas
e
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
images
=
images
,
eos_token_id
=
eos_token_id
)
for
prompts
,
images
in
inputs
_per_image
for
prompts
,
images
in
inputs
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_
imag
e
,
vllm_outputs_per_
imag
e
):
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_
cas
e
,
vllm_outputs_per_
cas
e
):
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
[
...
...
@@ -156,14 +151,86 @@ def run_test(
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Run the HF/vLLM comparison with one image per prompt.

    For every (image, prompt) pair, the prompt is repeated once per entry in
    ``size_factors`` and the image is rescaled by each factor; the resulting
    cases are passed to ``run_test`` with ``mm_limit=1``.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    inputs_per_image = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        rescaled_images = [
            rescale_image_size(pil_image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, rescaled_images))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
                         dtype) -> None:
    """Regression test for #7840.

    Each image is paired with its prompt as a separate single-prompt,
    single-image case, and the cases are run through ``run_test`` with fixed
    ``max_tokens=128`` and ``num_logprobs=10``.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    # One ([prompt], [image]) case per image asset.
    regression_inputs = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        regression_inputs.append(([prompt], [pil_image]))

    run_test(
        hf_runner,
        vllm_runner,
        regression_inputs,
        model,
        dtype=dtype,
        max_tokens=128,
        num_logprobs=10,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# No image
[],
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_multi_images_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_case
=
[
([
HF_MULTIIMAGE_IMAGE_PROMPT
for
_
in
size_factors
],
[[
rescale_image_size
(
image
,
factor
)
for
image
in
images
]
for
factor
in
size_factors
])
]
run_test
(
hf_runner
,
vllm_runner
,
i
mage_assets
,
i
nputs_per_case
,
model
,
size_factors
=
size_factors
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
2
,
tensor_parallel_size
=
1
,
)
tests/models/test_phimoe.py
0 → 100644
View file @
0640f227
"""Compare the outputs of HF and vLLM for moe models using greedy sampling.
Run `pytest tests/models/test_phimoe.py`.
"""
import
pytest
import
torch
from
vllm.utils
import
is_cpu
from
.utils
import
check_logprobs_close
# Model IDs that the parametrized test_models in this module runs against.
MODELS
=
[
"microsoft/Phi-3.5-MoE-instruct"
,
]
def test_phimoe_routing_function():
    """Check phimoe_routing_function against fixed expected top-k outputs.

    Two scenarios share the same hidden states and differ only in the gating
    output; for each, the returned weights must be close to the expected
    weights and the returned ids must equal the expected ids exactly.
    """
    from vllm.model_executor.models.phimoe import phimoe_routing_function

    def float_tensor(values):
        return torch.tensor(values, dtype=torch.float32, requires_grad=False)

    def long_tensor(values):
        return torch.tensor(values, dtype=torch.long, requires_grad=False)

    def routing_kwargs(gating_values):
        # A fresh hidden-state tensor is built for every scenario.
        return {
            "hidden_states":
            float_tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2),
            "gating_output": float_tensor(gating_values),
            "topk": 2,
            "renormalize": False,
        }

    # (keyword arguments, expected topk_weights, expected topk_ids)
    scenarios = [
        (
            routing_kwargs([0.1, 0.2, 0.3, 0.4]),
            float_tensor([1., 1.]),
            long_tensor([3, 2]),
        ),
        (
            routing_kwargs([0.4, 0.2, 0.3, 0.4]),
            float_tensor([0.5, 1.]),
            long_tensor([0, 3]),
        ),
    ]

    for kwargs, expected_weights, expected_ids in scenarios:
        actual_weights, actual_ids = phimoe_routing_function(**kwargs)
        assert torch.allclose(actual_weights, expected_weights)
        assert torch.equal(actual_ids, expected_ids)
def get_gpu_memory():
    """Return the current CUDA device's total memory in GiB, or 0 on failure.

    Any exception raised while querying the device (for example when no CUDA
    device can be used) is swallowed on purpose and reported as ``0`` so the
    caller can treat it as "not enough memory".
    """
    bytes_per_gib = 1024**3
    try:
        device_index = torch.cuda.current_device()
        total_bytes = torch.cuda.get_device_properties(
            device_index).total_memory
        return total_bytes / bytes_per_gib
    except Exception:
        return 0
@pytest.mark.skipif(
    condition=is_cpu(),
    reason="This test takes a lot time to run on CPU, "
    "and vllm CI's disk space is not enough for this model.",
)
@pytest.mark.skipif(
    condition=get_gpu_memory() < 100,
    reason="Skip this test if GPU memory is insufficient.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy log-probs produced by the HF and vLLM runners.

    Skipped on CPU and when ``get_gpu_memory()`` reports less than 100. Both
    runners generate from the same prompts with the same limits, and the two
    result lists are handed to ``check_logprobs_close``.
    """
    generation_args = (example_prompts, max_tokens, num_logprobs)

    # The HF runner is used first, then the vLLM runner, each inside its own
    # context manager.
    with hf_runner(model, dtype=dtype) as reference_model:
        reference_outputs = reference_model.generate_greedy_logprobs_limit(
            *generation_args)

    with vllm_runner(model, dtype=dtype) as candidate_model:
        candidate_outputs = candidate_model.generate_greedy_logprobs(
            *generation_args)

    check_logprobs_close(
        outputs_0_lst=reference_outputs,
        outputs_1_lst=candidate_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/models/test_ultravox.py
View file @
0640f227
from
typing
import
List
,
Optional
,
Tuple
,
Type
import
librosa
import
numpy
as
np
import
pytest
from
transformers
import
AutoModel
,
AutoTokenizer
,
BatchEncoding
from
vllm.assets.audio
import
AudioAsset
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
...
...
@@ -18,36 +16,32 @@ MODEL_NAME = "fixie-ai/ultravox-v0_3"
AudioTuple
=
Tuple
[
np
.
ndarray
,
int
]
VLLM_PLACEHOLDER
=
"<|reserved_special_token_0|>"
HF_PLACEHOLDER
=
"<|audio|>"
@
pytest
.
fixture
(
scope
=
"session"
)
def
audio_and_sample_rate
():
return
AudioAsset
(
"mary_had_lamb"
).
audio_and_sample_rate
def
audio_assets
():
from
vllm.assets.audio
import
AudioAsset
return
[
AudioAsset
(
"mary_had_lamb"
),
AudioAsset
(
"winning_call"
)]
@
pytest
.
fixture
def
prompts_and_audios
(
audio_and_sample_rate
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
(
"mary_had_lamb"
,
"winning_call"
))
def
audio
(
request
):
from
vllm.assets.audio
import
AudioAsset
return
AudioAsset
(
request
.
param
)
vllm_placeholder
=
"<|reserved_special_token_0|>"
hf_placeholder
=
"<|audio|>"
question
=
"What's in the audio?"
vllm_prompt
=
tokenizer
.
apply_chat_template
(
[{
'role'
:
'user'
,
'content'
:
f
"
{
vllm_placeholder
}
\n
{
question
}
"
}],
tokenize
=
False
,
add_generation_prompt
=
True
)
hf_prompt
=
tokenizer
.
apply_chat_template
(
[{
'role'
:
'user'
,
'content'
:
f
"
{
hf_placeholder
}
\n
{
question
}
"
}],
tokenize
=
False
,
add_generation_prompt
=
True
)
def
_get_prompt
(
audio_count
,
question
,
placeholder
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
placeholder
=
f
"
{
placeholder
}
\n
"
*
audio_count
return
[(
vllm_prompt
,
hf_prompt
,
audio_and_sample_rate
)]
return
tokenizer
.
apply_chat_template
([{
'role'
:
'user'
,
'content'
:
f
"
{
placeholder
}{
question
}
"
}],
tokenize
=
False
,
add_generation_prompt
=
True
)
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
...
...
@@ -109,6 +103,7 @@ def run_test(
dtype
=
dtype
,
postprocess_inputs
=
process
,
auto_cls
=
AutoModel
)
as
hf_model
:
import
librosa
hf_outputs_per_audio
=
[
hf_model
.
generate_greedy_logprobs_limit
(
...
...
@@ -134,15 +129,71 @@ def run_test(
)
def run_multi_audio_test(
    vllm_runner: Type[VllmRunner],
    prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Run vLLM on prompts that each carry several audio clips.

    Args:
        vllm_runner: factory used as a context manager to obtain the model.
        prompts_and_audios: ``(prompt, audios)`` pairs; ``audios`` is the
            list of clips attached to that prompt.
        model: model name handed to ``vllm_runner``.
        dtype: dtype string forwarded to ``vllm_runner``.
        max_tokens: generation length forwarded to
            ``generate_greedy_logprobs``.
        num_logprobs: number of logprobs requested per token.
        tensor_parallel_size: forwarded to ``vllm_runner``.
        distributed_executor_backend: forwarded to ``vllm_runner``.

    The per-prompt audio limit is set to the largest number of clips
    attached to any single prompt.
    """
    # Largest number of clips on any one prompt.
    max_audios_per_prompt = max(
        (len(clips) for _, clips in prompts_and_audios))
    prompt_texts = [text for text, _ in prompts_and_audios]
    audio_lists = [clips for _, clips in prompts_and_audios]

    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
            limit_mm_per_prompt={"audio": max_audios_per_prompt},
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            prompt_texts,
            max_tokens,
            num_logprobs=num_logprobs,
            audios=audio_lists,
        )

    # The HuggingFace model doesn't support multiple audios yet, so
    # just assert that some tokens were generated.
    assert all(token_ids for token_ids, *_ in vllm_outputs)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
prompts_and_audios
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
def
test_models
(
hf_runner
,
vllm_runner
,
audio
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
vllm_prompt
=
_get_prompt
(
1
,
"Describe the audio above."
,
VLLM_PLACEHOLDER
)
hf_prompt
=
_get_prompt
(
1
,
"Describe the audio above."
,
HF_PLACEHOLDER
)
run_test
(
hf_runner
,
vllm_runner
,
prompts_and_audios
,
[(
vllm_prompt
,
hf_prompt
,
audio
.
audio_and_sample_rate
)],
MODEL_NAME
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models_with_multiple_audios
(
vllm_runner
,
audio_assets
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
vllm_prompt
=
_get_prompt
(
len
(
audio_assets
),
"Describe each of the audios above."
,
VLLM_PLACEHOLDER
)
run_multi_audio_test
(
vllm_runner
,
[(
vllm_prompt
,
[
audio
.
audio_and_sample_rate
for
audio
in
audio_assets
])],
MODEL_NAME
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
...
...
tests/models/utils.py
View file @
0640f227
import
warnings
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Union
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
Logprob
,
SampleLogprobs
TokensText
=
Tuple
[
List
[
int
],
str
]
...
...
@@ -38,34 +38,39 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
float
]],
SampleLogprobs
]]]
# Allow for tokens to be represented as str's rather than IDs
TextTextLogprobs
=
Tuple
[
List
[
str
],
str
,
Optional
[
Union
[
List
[
Dict
[
str
,
float
]],
List
[
Dict
[
str
,
Logprob
]]]]]
def
check_logprobs_close
(
*
,
outputs_0_lst
:
Sequence
[
TokensTextLogprobs
],
outputs_1_lst
:
Sequence
[
TokensTextLogprobs
],
outputs_0_lst
:
Sequence
[
Union
[
TokensTextLogprobs
,
TextTextLogprobs
]
],
outputs_1_lst
:
Sequence
[
Union
[
TokensTextLogprobs
,
TextTextLogprobs
]
],
name_0
:
str
,
name_1
:
str
,
num_outputs_0_skip_tokens
:
int
=
0
,
warn_on_mismatch
:
bool
=
True
,
):
"""
Compare the logprobs of two sequences generated by different models,
always_check_logprobs
:
bool
=
False
,
)
->
None
:
"""
Compare the logprobs of two sequences generated by different models,
which should be similar but not necessarily equal.
Arguments:
* outputs_0_lst: First sequence to compare
* outputs_0_lst: Second sequence to compare
* name_0: sequence #0 name
* name_1: sequence #1 name
* num_outputs_0_skip_tokens: If > 0, specifies the number of initial
Args:
outputs_0_lst: First sequence to compare
outputs_1_lst: Second sequence to compare
name_0: sequence #0 name
name_1: sequence #1 name
num_outputs_0_skip_tokens: If > 0, specifies the number of initial
sequence #0 tokens & logprobs to discard
before comparison, i.e. all
of sequence #1 will be compared to
sequence #0 beginning at index
num_outputs_0_skip_tokens
*
warn_on_mismatch: Issue a warning if there is token-wise or text-wise
warn_on_mismatch: Issue a warning if there is token-wise or text-wise
mismatch between the two sequences
always_check_logprobs: If true, check logprobs even when tokens match
"""
assert
len
(
outputs_0_lst
)
==
len
(
outputs_1_lst
)
...
...
@@ -94,8 +99,12 @@ def check_logprobs_close(
for
idx
,
(
output_id_0
,
output_id_1
)
in
enumerate
(
zip
(
output_ids_0
,
output_ids_1
)):
# If generated tokens don't match, then
if
output_id_0
!=
output_id_1
:
is_tok_mismatch
=
output_id_0
!=
output_id_1
# If generated tokens don't match
# or it is desired to always check logprobs,
# then
if
is_tok_mismatch
or
always_check_logprobs
:
logprobs_elem_0
=
logprobs_0
[
idx
]
logprobs_elem_1
=
logprobs_1
[
idx
]
...
...
@@ -111,7 +120,7 @@ def check_logprobs_close(
assert
output_id_0
in
logprobs_elem_1
,
fail_msg
assert
output_id_1
in
logprobs_elem_0
,
fail_msg
if
warn_on_mismatch
:
if
warn_on_mismatch
and
is_tok_mismatch
:
with
warnings
.
catch_warnings
():
# This ensures that repeated warnings are shown
# in the output, not just the first occurrence
...
...
tests/multi_step/test_correctness.py
→
tests/multi_step/test_correctness
_async_llm
.py
View file @
0640f227
# Test the AsyncLLMEngine with multi-step-decoding
from
typing
import
List
from
typing
import
List
,
Optional
import
pytest
from
..utils
import
RemoteOpenAIServer
from
..models.utils
import
check_logprobs_close
from
..utils
import
(
completions_with_server_args
,
get_client_text_generations
,
get_client_text_logprob_generations
)
MODELS
=
[
"JackFram/llama-160m"
,
...
...
@@ -23,22 +25,6 @@ DEFAULT_SERVER_ARGS: List[str] = [
]
async def completions_with_server_args(prompts: List[str], model_name: str,
                                       server_cli_args: List[str]):
    """Start a ``RemoteOpenAIServer`` and request completions for *prompts*.

    Args:
        prompts: prompts sent in a single completions request.
        model_name: model served and named in the request.
        server_cli_args: CLI arguments used to launch the server.

    Returns:
        The response of ``client.completions.create`` (temperature 0,
        non-streaming, at most 5 tokens).
    """
    completions = None
    with RemoteOpenAIServer(model_name, server_cli_args) as remote_server:
        async_client = remote_server.get_async_client()
        completions = await async_client.completions.create(
            model=model_name,
            prompt=prompts,
            temperature=0,
            stream=False,
            max_tokens=5,
        )
    # Still None only if the request never completed.
    assert completions is not None

    return completions
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
((
"tp_size, pp_size"
),
[
(
1
,
1
),
...
...
@@ -47,10 +33,43 @@ async def completions_with_server_args(prompts: List[str], model_name: str,
@
pytest
.
mark
.
parametrize
(
"eager_mode"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"num_scheduler_steps"
,
NUM_SCHEDULER_STEPS
)
@
pytest
.
mark
.
parametrize
(
"num_prompts"
,
NUM_PROMPTS
)
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
None
,
5
])
@
pytest
.
mark
.
parametrize
(
"is_async"
,
[
False
,
True
])
@
pytest
.
mark
.
asyncio
async
def
test_multi_step
(
example_prompts
,
model
:
str
,
tp_size
:
int
,
pp_size
:
int
,
eager_mode
:
int
,
num_scheduler_steps
:
int
,
num_prompts
:
int
):
async
def
test_multi_step
(
example_prompts
,
model
:
str
,
tp_size
:
int
,
pp_size
:
int
,
eager_mode
:
int
,
num_scheduler_steps
:
int
,
num_prompts
:
int
,
is_async
:
bool
,
num_logprobs
:
Optional
[
int
],
)
->
None
:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
client/server environment.
Set up an engine with single-step scheduling as a ground-truth reference.
Send a completions API request to both engines with the same prompts.
Validate:
* Generated tokens match
* Generated logprobs are all very close
Args:
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
tp_size: degree of tensor-parallelism
pp_size: degree of pipeline-parallelism
eager_mode
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
num_prompts: number of example prompts under test
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> no logprobs
"""
prompts
=
example_prompts
if
len
(
prompts
)
<
num_prompts
:
...
...
@@ -62,6 +81,9 @@ async def test_multi_step(example_prompts, model: str, tp_size: int,
ms_server_args
=
DEFAULT_SERVER_ARGS
+
\
[
"--num-scheduler-steps"
,
f
"
{
num_scheduler_steps
}
"
]
if
not
is_async
:
ms_server_args
+=
[
"--disable-async-output-proc"
]
if
eager_mode
:
ms_server_args
.
append
(
"--enforce-eager"
)
...
...
@@ -72,14 +94,36 @@ async def test_multi_step(example_prompts, model: str, tp_size: int,
str
(
pp_size
),
]
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions
=
await
completions_with_server_args
(
prompts
,
model
,
server_args
+
distributed_args
)
prompts
,
model
,
server_args
+
distributed_args
,
num_logprobs
,
max_wait_seconds
=
5
*
240
)
test_completions
=
await
completions_with_server_args
(
prompts
,
model
,
ms_server_args
+
distributed_args
)
def
get_text_generations
(
completions
):
return
[
x
.
text
for
x
in
completions
.
choices
]
ref_generations
=
get_text_generations
(
ref_completions
)
test_generations
=
get_text_generations
(
test_completions
)
prompts
,
model
,
ms_server_args
+
distributed_args
,
num_logprobs
,
max_wait_seconds
=
5
*
240
)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations
=
get_client_text_generations
(
ref_completions
)
test_generations
=
get_client_text_generations
(
test_completions
)
assert
ref_generations
==
test_generations
# Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling.
ref_text_logprobs
=
get_client_text_logprob_generations
(
ref_completions
)
test_text_logprobs
=
get_client_text_logprob_generations
(
test_completions
)
check_logprobs_close
(
outputs_0_lst
=
ref_text_logprobs
,
outputs_1_lst
=
test_text_logprobs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
tests/multi_step/test_correctness_llm.py
0 → 100644
View file @
0640f227
# Test the LLMEngine with multi-step-decoding
from
typing
import
Optional
import
pytest
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
# Models the multi-step LLMEngine correctness test is parametrized over.
MODELS = ["JackFram/llama-160m"]
# Multi-step decoding steps
NUM_SCHEDULER_STEPS = [8]
# Prompt counts the test is parametrized over.
NUM_PROMPTS = [10]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
def test_multi_step_llm(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    max_tokens: int,
    enforce_eager: bool,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
) -> None:
    """Test vLLM engine with multi-step scheduling via sync LLM Engine.

    Set up a HuggingFace (HF) transformers model as a ground-truth reference.

    Prompt both with the same example prompts.

    Validate:
    * If `num_logprobs` is `None`: greedy outputs are equal
      (`check_outputs_equal`)
    * Otherwise: greedy logprobs are close (`check_logprobs_close`)

    Args:
      hf_runner: HF transformers model runner fixture
      vllm_runner: vLLM model runner fixture
      example_prompts: test fixture providing example prompts
      model: model under test (same for the HF reference and vLLM)
      dtype: tensor datatype for engine to utilize
      tp_size: degree of tensor-parallelism
      max_tokens: the maximum number of tokens to generate
      enforce_eager: forwarded to the vLLM runner's `enforce_eager` option
      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
                           GPU -> CPU output transfer
      num_prompts: number of example prompts under test
      num_logprobs: number of logprobs requested from both runners;
                    `None` -> plain greedy generation without logprobs
    """

    # Repeat the fixture prompts until there are at least `num_prompts`
    # of them, then trim to exactly `num_prompts`.
    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts

    with vllm_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
            use_v2_block_manager=True,
            num_scheduler_steps=num_scheduler_steps,
    ) as vllm_model:
        vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
                        if num_logprobs is None else
                        vllm_model.generate_greedy_logprobs(
                            prompts, max_tokens, num_logprobs))

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
                      if num_logprobs is None else
                      hf_model.generate_greedy_logprobs_limit(
                          prompts, max_tokens, num_logprobs))

    if num_logprobs is None:
        # No logprobs requested: outputs must match exactly.
        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
    else:
        # Logprobs requested: allow small numerical divergence.
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
tests/multimodal/test_base.py
0 → 100644
View file @
0640f227
import
torch
from
vllm.multimodal.base
import
MultiModalInputs
,
NestedTensors
def assert_nested_tensors_equal(expected: NestedTensors,
                                actual: NestedTensors):
    """Assert that two nested tensor structures are identical.

    Both arguments must have exactly the same type. Tensors are compared
    with ``torch.equal``; any other value is treated as a sequence whose
    elements are compared pairwise, recursively.

    Raises:
        AssertionError: if the types differ, a pair of tensors is not
            equal, or two sequences have different lengths.
    """
    assert type(expected) == type(actual)
    if isinstance(expected, torch.Tensor):
        assert torch.equal(expected, actual)
    else:
        # zip() stops at the shorter input, so without this check a
        # missing or extra trailing element would go unnoticed.
        assert len(expected) == len(actual), (
            f"nested length mismatch: {len(expected)} != {len(actual)}")
        for expected_item, actual_item in zip(expected, actual):
            assert_nested_tensors_equal(expected_item, actual_item)
def assert_multimodal_inputs_equal(expected: MultiModalInputs,
                                   actual: MultiModalInputs):
    """Assert that two multimodal input mappings hold the same data.

    The key sets must match, and the values stored under each key must
    satisfy ``assert_nested_tensors_equal``.
    """
    expected_keys = set(expected.keys())
    actual_keys = set(actual.keys())
    assert expected_keys == actual_keys

    for name in expected:
        expected_value = expected[name]
        actual_value = actual[name]
        assert_nested_tensors_equal(expected_value, actual_value)
def test_multimodal_input_batch_single_tensor():
    """Batching one input compares equal to the tensor unsqueezed at dim 0."""
    image = torch.rand([1, 2])
    batched = MultiModalInputs.batch([{"image": image}])
    reference = {"image": image.unsqueeze(0)}
    assert_multimodal_inputs_equal(batched, reference)
def test_multimodal_input_batch_multiple_tensors():
    """Batching same-shaped tensors compares equal to their torch.stack."""
    images = [torch.rand([1, 1, 2]) for _ in range(3)]
    batched = MultiModalInputs.batch([{"image": image} for image in images])
    reference = {"image": torch.stack(images)}
    assert_multimodal_inputs_equal(batched, reference)
def test_multimodal_input_batch_multiple_heterogeneous_tensors():
    """Batching differently-shaped tensors compares equal to a plain list."""
    # The middle dimension differs per tensor: 2, 3, 4.
    images = [torch.rand([1, rows, 2]) for rows in (2, 3, 4)]
    batched = MultiModalInputs.batch([{"image": image} for image in images])
    reference = {"image": images}
    assert_multimodal_inputs_equal(batched, reference)
def test_multimodal_input_batch_nested_tensors():
    """Batching single-element lists compares equal to one stacked tensor."""
    images = [torch.rand([2, 3]) for _ in range(3)]
    batched = MultiModalInputs.batch([{"image": [image]} for image in images])
    # Each one-element list is expected as the tensor unsqueezed at dim 0.
    reference = {
        "image": torch.stack([image.unsqueeze(0) for image in images])
    }
    assert_multimodal_inputs_equal(batched, reference)
def test_multimodal_input_batch_heterogeneous_lists():
    """Lists of different lengths are expected stacked per list, not overall."""
    first, second, third = (torch.rand([1, 2, 3]) for _ in range(3))
    batched = MultiModalInputs.batch([
        {"image": [first, second]},
        {"image": [third]},
    ])
    reference = {
        "image": [torch.stack([first, second]),
                  third.unsqueeze(0)]
    }
    assert_multimodal_inputs_equal(batched, reference)
def test_multimodal_input_batch_multiple_batchable_lists():
    """Equal-length lists of same-shaped tensors are expected fully stacked."""
    first, second, third, fourth = (torch.rand([1, 2, 3]) for _ in range(4))
    batched = MultiModalInputs.batch([
        {"image": [first, second]},
        {"image": [third, fourth]},
    ])
    left_stack = torch.stack([first, second])
    right_stack = torch.stack([third, fourth])
    reference = {"image": torch.stack([left_stack, right_stack])}
    assert_multimodal_inputs_equal(batched, reference)
def test_multimodal_input_batch_mixed_stacking_depths():
    """Expect a plain list where shapes differ, a tensor for a lone item."""
    # The middle dimension differs per tensor: 2, 3, 4.
    small, medium, large = (torch.rand([1, rows, 3]) for rows in (2, 3, 4))

    # Mismatched pair first, lone tensor second.
    batched = MultiModalInputs.batch([
        {"image": [small, medium]},
        {"image": [large]},
    ])
    reference = {"image": [[small, medium], large.unsqueeze(0)]}
    assert_multimodal_inputs_equal(batched, reference)

    # Lone tensor first, mismatched pair second.
    batched = MultiModalInputs.batch([
        {"image": [small]},
        {"image": [medium, large]},
    ])
    reference = {"image": [small.unsqueeze(0), [medium, large]]}
    assert_multimodal_inputs_equal(batched, reference)
Prev
1
2
3
4
5
6
7
8
9
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment