Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
539aa992
Commit
539aa992
authored
Sep 27, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.2' into v0.6.2-dev
parents
93872128
7193774b
Changes
383
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
882 additions
and
375 deletions
+882
-375
tests/kernels/test_moe.py
tests/kernels/test_moe.py
+73
-8
tests/kernels/test_permute_cols.py
tests/kernels/test_permute_cols.py
+15
-0
tests/kernels/test_pos_encoding.py
tests/kernels/test_pos_encoding.py
+5
-9
tests/kernels/test_prefix_prefill.py
tests/kernels/test_prefix_prefill.py
+3
-9
tests/kernels/test_rand.py
tests/kernels/test_rand.py
+0
-52
tests/kernels/test_rotary_embedding.py
tests/kernels/test_rotary_embedding.py
+62
-0
tests/kernels/test_sampler.py
tests/kernels/test_sampler.py
+0
-209
tests/kernels/test_utils.py
tests/kernels/test_utils.py
+24
-0
tests/kernels/utils.py
tests/kernels/utils.py
+37
-6
tests/lora/conftest.py
tests/lora/conftest.py
+1
-4
tests/lora/test_layers.py
tests/lora/test_layers.py
+2
-3
tests/lora/test_punica_sizes.py
tests/lora/test_punica_sizes.py
+10
-13
tests/lora/test_punica_variation.py
tests/lora/test_punica_variation.py
+10
-13
tests/models/decoder_only/language/test_big_models.py
tests/models/decoder_only/language/test_big_models.py
+9
-6
tests/models/decoder_only/language/test_granite.py
tests/models/decoder_only/language/test_granite.py
+2
-7
tests/models/decoder_only/language/test_mistral.py
tests/models/decoder_only/language/test_mistral.py
+91
-0
tests/models/decoder_only/vision_language/test_llava_next_video.py
...els/decoder_only/vision_language/test_llava_next_video.py
+0
-3
tests/models/decoder_only/vision_language/test_llava_onevision.py
...dels/decoder_only/vision_language/test_llava_onevision.py
+356
-0
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+181
-5
tests/models/decoder_only/vision_language/test_qwen.py
tests/models/decoder_only/vision_language/test_qwen.py
+1
-28
No files found.
tests/kernels/test_moe.py
View file @
539aa992
...
...
@@ -9,15 +9,19 @@ import torch
from
transformers
import
MixtralConfig
from
transformers.models.mixtral.modeling_mixtral
import
MixtralSparseMoeBlock
from
tests.kernels.utils
import
opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.model_executor.layers.fused_moe.fused_marlin_moe
import
(
fused_marlin_moe
,
single_marlin_moe
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_topk
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
fused_topk
,
moe_align_block_size
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test
import
(
marlin_quantize
)
from
vllm.model_executor.models.mixtral
import
MixtralMoE
from
vllm.scalar_type
import
scalar_types
from
vllm.utils
import
seed_everything
def
torch_moe
(
a
,
w1
,
w2
,
score
,
topk
):
...
...
@@ -140,6 +144,7 @@ def compute_max_diff(output, output_ref):
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
2
,
6
])
@
pytest
.
mark
.
parametrize
(
"group_size"
,
[
-
1
,
32
,
64
,
128
])
@
pytest
.
mark
.
parametrize
(
"act_order"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
[
4
,
8
])
def
test_fused_marlin_moe
(
m
:
int
,
n
:
int
,
...
...
@@ -148,8 +153,9 @@ def test_fused_marlin_moe(
topk
:
int
,
group_size
:
int
,
act_order
:
bool
,
num_bits
:
int
,
):
torch
.
manual_seed
(
7
)
seed_everything
(
7
)
if
topk
>
e
:
return
...
...
@@ -161,13 +167,12 @@ def test_fused_marlin_moe(
if
group_size
in
(
k
,
n
):
return
quant_type
=
scalar_types
.
uint4b8
quant_type
=
(
scalar_types
.
uint4b8
if
num_bits
==
4
else
scalar_types
.
uint8b128
)
dtype
=
torch
.
float16
a
=
torch
.
randn
((
m
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w1
=
torch
.
randn
((
e
,
2
*
n
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w2
=
torch
.
randn
((
e
,
k
,
n
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
for
i
in
range
(
w2
.
shape
[
0
]):
w2
[
0
]
=
torch
.
eye
(
k
,
n
,
device
=
"cuda"
,
dtype
=
dtype
)
w_ref1_l
=
[]
qweight1_l
=
[]
...
...
@@ -240,10 +245,40 @@ def test_fused_marlin_moe(
topk_ids
,
w1_scale
=
scales1
,
w2_scale
=
scales2
,
num_bits
=
num_bits
,
)
assert
compute_max_diff
(
marlin_output
,
triton_output
)
<
4e-2
if
ops
.
supports_moe_ops
:
token_expert_indicies
=
torch
.
empty
(
m
,
topk
,
dtype
=
torch
.
int32
,
device
=
a
.
device
)
opcheck
(
torch
.
ops
.
_moe_C
.
topk_softmax
,
(
topk_weights
,
topk_ids
,
token_expert_indicies
,
score
.
float
(),
))
block_size_m
=
4
sorted_token_ids
,
_
,
_
=
moe_align_block_size
(
topk_ids
,
block_size_m
,
e
)
max_workspace_size
=
((
m
+
255
)
//
256
)
*
(
max
(
2
*
n
,
k
)
//
64
)
*
16
workspace
=
torch
.
zeros
(
max_workspace_size
,
dtype
=
torch
.
int
,
device
=
"cuda"
,
requires_grad
=
False
)
opcheck
(
torch
.
ops
.
_moe_C
.
marlin_gemm_moe
,
(
a
,
qweight1
,
sorted_token_ids
,
topk_weights
,
topk_ids
,
scales1
,
g_idx1
,
sort_indices1
,
workspace
,
quant_type
,
m
,
2
*
n
,
k
,
True
,
e
,
topk
,
block_size_m
,
True
,
False
))
@
pytest
.
mark
.
skip
(
"This test is here for the sake of debugging, "
"don't run it in automated tests."
)
...
...
@@ -254,7 +289,8 @@ def test_fused_marlin_moe(
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
2
,
6
])
@
pytest
.
mark
.
parametrize
(
"group_size"
,
[
-
1
,
32
,
64
,
128
])
@
pytest
.
mark
.
parametrize
(
"act_order"
,
[
True
,
False
])
def
test_marlin_moe_mmm
(
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
[
4
,
8
])
def
test_single_marlin_moe_multiply
(
m
:
int
,
n
:
int
,
k
:
int
,
...
...
@@ -262,6 +298,7 @@ def test_marlin_moe_mmm(
topk
:
int
,
group_size
:
int
,
act_order
:
bool
,
num_bits
:
int
,
):
if
topk
>
e
:
return
...
...
@@ -273,7 +310,8 @@ def test_marlin_moe_mmm(
if
group_size
==
k
:
return
quant_type
=
scalar_types
.
uint4b8
quant_type
=
(
scalar_types
.
uint4b8
if
num_bits
==
4
else
scalar_types
.
uint8b128
)
dtype
=
torch
.
float16
a
=
torch
.
randn
((
m
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w
=
torch
.
randn
((
e
,
n
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
...
...
@@ -308,7 +346,34 @@ def test_marlin_moe_mmm(
g_idx
,
sort_indices
,
topk
,
renormalize
=
False
)
renormalize
=
False
,
num_bits
=
num_bits
)
torch_output
=
torch_moe_single
(
a
,
w_ref
.
transpose
(
1
,
2
),
score
,
topk
)
assert
compute_max_diff
(
marlin_output
,
torch_output
)
<
1e-2
def test_moe_align_block_size_opcheck():
    """Run ``opcheck`` on the ``moe_align_block_size`` custom op.

    Builds a small random ``topk_ids`` tensor plus the three output
    buffers the op receives, then hands everything to ``opcheck``.
    """
    num_experts = 4
    block_size = 4

    # (3, 4) tensor of random expert ids drawn from [0, num_experts).
    topk_ids = torch.randint(0,
                             num_experts, (3, 4),
                             dtype=torch.int32,
                             device='cuda')
    target_device = topk_ids.device
    num_ids = topk_ids.numel()

    # Size used for the sorted-id buffer: every id plus
    # (block_size - 1) extra slots per expert.
    padded_len = num_ids + num_experts * (block_size - 1)
    num_blocks = padded_len // block_size

    # Pre-filled with ``num_ids`` (the original used empty + fill_).
    sorted_ids = torch.full((padded_len, ),
                            num_ids,
                            dtype=torch.int32,
                            device=target_device)
    expert_ids = torch.empty((num_blocks, ),
                             dtype=torch.int32,
                             device=target_device)
    num_tokens_post_pad = torch.empty(1,
                                      dtype=torch.int32,
                                      device=target_device)

    op_args = (topk_ids, num_experts, block_size, sorted_ids, expert_ids,
               num_tokens_post_pad)
    opcheck(torch.ops._C.moe_align_block_size, op_args)
tests/kernels/test_permute_cols.py
0 → 100644
View file @
539aa992
import
pytest
import
torch
from
tests.kernels.utils
import
opcheck
from
vllm._custom_ops
import
permute_cols
@pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)])
@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16])
def test_permute_cols(shape, dtype):
    """Check ``permute_cols`` against plain column indexing.

    The op is first run through ``opcheck`` and then its result is
    compared with ``x[:, perm]``.
    """
    # Both inputs are generated on the host and then moved to the GPU.
    matrix = torch.randn(shape, dtype=dtype).cuda()
    num_cols = matrix.shape[1]
    col_order = torch.randperm(num_cols).to(torch.int).cuda()

    opcheck(torch.ops._C.permute_cols, (matrix, col_order))

    expected = matrix[:, col_order]
    actual = permute_cols(matrix, col_order)
    torch.testing.assert_close(actual, expected)
\ No newline at end of file
tests/kernels/test_pos_encoding.py
View file @
539aa992
...
...
@@ -5,6 +5,7 @@ import pytest
import
torch
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.utils
import
seed_everything
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
...
@@ -46,9 +47,8 @@ def test_rotary_embedding(
)
->
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
rotary_dim
=
head_size
...
...
@@ -100,9 +100,7 @@ def test_batched_rotary_embedding(
max_position
:
int
=
8192
,
base
:
int
=
10000
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
rotary_dim
=
head_size
...
...
@@ -162,9 +160,7 @@ def test_batched_rotary_embedding_multi_lora(
max_position
:
int
=
8192
,
base
:
int
=
10000
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
rotary_dim
=
head_size
...
...
tests/kernels/test_prefix_prefill.py
View file @
539aa992
...
...
@@ -10,7 +10,7 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
from
vllm.attention.backends.xformers
import
_make_alibi_bias
from
vllm.attention.ops.prefix_prefill
import
context_attention_fwd
from
vllm.utils
import
is_hip
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
seed_everything
NUM_HEADS
=
[
64
]
NUM_QUERIES_PER_KV
=
[
1
,
8
,
64
]
...
...
@@ -40,10 +40,7 @@ def test_contexted_kv_attention(
kv_cache_dtype
:
str
,
device
:
str
,
)
->
None
:
random
.
seed
(
0
)
torch
.
manual_seed
(
0
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
0
)
seed_everything
(
0
)
torch
.
set_default_device
(
device
)
# Need this, otherwise when we capture the graph the process
...
...
@@ -239,10 +236,7 @@ def test_contexted_kv_attention_alibi(
kv_cache_dtype
:
str
,
device
:
str
,
)
->
None
:
random
.
seed
(
0
)
torch
.
manual_seed
(
0
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
0
)
seed_everything
(
0
)
torch
.
set_default_device
(
device
)
# Need this, otherwise when we capture the graph the process
...
...
tests/kernels/test_rand.py
deleted
100644 → 0
View file @
93872128
import
random
import
pytest
import
torch
from
vllm.model_executor.layers.ops.rand
import
seeded_uniform
from
vllm.model_executor.utils
import
set_random_seed
@pytest.mark.parametrize("dtype",
                         [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("use_3d", [True, False])
def test_seeded_uniform(dtype: torch.dtype, use_3d: bool):
    """Check that ``seeded_uniform`` is reproducible for fixed seeds.

    For 512 different global seeds, a random 2-D or 3-D shape and one
    random seed per row are drawn. Repeated calls with the same seeds must
    agree, including a call that writes into a pre-filled ``out`` tensor.
    """
    device = "cuda"
    long_info = torch.iinfo(torch.long)

    for seed in range(512):
        set_random_seed(seed)

        # Draw order is kept as rows, cols, then the optional middle dim.
        rows = random.randint(1, 512)
        cols = random.randint(1, 64000)
        dims = [rows, random.randint(2, 10), cols] if use_3d else [rows, cols]

        seeds = torch.randint(long_info.min,
                              long_info.max, (rows, ),
                              device=device)

        def draw():
            return seeded_uniform(*dims,
                                  seeds=seeds,
                                  dtype=dtype,
                                  device=device)

        out = draw()

        # The same seeds must give the same output on every call; each
        # repeat is freed right away to save memory.
        for _ in range(2):
            repeat = draw()
            torch.testing.assert_close(out, repeat)
            del repeat

        # Start from a tensor full of garbage (-1) to make sure the
        # provided ``out`` buffer really is overwritten.
        garbage = torch.full(
            (*dims, ),
            -1,
            dtype=dtype,
            device=device,
        )
        out_with_tensor = seeded_uniform(
            *dims,
            out=garbage,
            seeds=seeds,
            dtype=dtype,
        )
        torch.testing.assert_close(out, out_with_tensor)
tests/kernels/test_rotary_embedding.py
0 → 100644
View file @
539aa992
"""
Opcheck tests for the rotary embedding custom ops
"""
from
typing
import
Optional
import
pytest
import
torch
from
tests.kernels.utils
import
opcheck
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
def rotary_embedding_opcheck(rot,
                             positions: torch.Tensor,
                             query: torch.Tensor,
                             key: torch.Tensor,
                             offsets: Optional[torch.Tensor] = None):
    """Run ``opcheck`` on the rotary embedding custom op.

    Args:
        rot: Object providing ``cos_sin_cache``, ``head_size``,
            ``is_neox_style`` and ``rotary_dim``.
        positions: Positions tensor forwarded to the op.
        query: Query tensor forwarded to the op.
        key: Key tensor forwarded to the op.
        offsets: When given, the batched variant of the op is checked
            and receives these offsets; otherwise the plain variant is
            checked.
    """
    # The cache is moved to the query's device and dtype before use.
    cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)

    # ops.rotary_embedding()/batched_rotary_embedding()
    # are in-place operations that update the query and key tensors.
    shared_args = (positions, query, key, rot.head_size, cache,
                   rot.is_neox_style)

    if offsets is None:
        opcheck(torch.ops._C.rotary_embedding, shared_args)
        return

    opcheck(torch.ops._C.batched_rotary_embedding,
            shared_args + (rot.rotary_dim, offsets))
@pytest.mark.parametrize("device", ["cuda"])
@pytest.mark.parametrize("max_position", [11, 4096, 32768])
@pytest.mark.parametrize("is_neox_style", [True, False])
@pytest.mark.parametrize("rotary_dim", [32])
@pytest.mark.parametrize("head_size", [32, 108])
@pytest.mark.parametrize("seq_len", [11, 1024])
def test_rotary_embedding_opcheck(dist_init, device, max_position,
                                  is_neox_style, rotary_dim, head_size,
                                  seq_len):
    """Opcheck both rotary embedding ops on random inputs.

    The plain op is checked first (no offsets), then the batched op with
    an all-zero offsets tensor.
    """
    batch_size = 1
    num_heads = 7
    # NOTE(review): a base of 0 looks unusual for a rotary embedding;
    # confirm it is intentional for this opcheck-only test.
    base = 0

    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
                          is_neox_style, torch.float32)

    positions = torch.randint(0,
                              max_position, (batch_size, seq_len),
                              device=device)
    hidden = num_heads * head_size
    query = torch.randn(batch_size,
                        seq_len,
                        hidden,
                        dtype=torch.float32,
                        device=device)
    key = torch.randn_like(query)

    # Without offsets: plain rotary_embedding op.
    rotary_embedding_opcheck(rot, positions, query, key)

    # With offsets: batched_rotary_embedding op.
    zero_offsets = torch.zeros(batch_size * seq_len,
                               device=device,
                               dtype=torch.long)
    rotary_embedding_opcheck(rot, positions, query, key, zero_offsets)
tests/kernels/test_sampler.py
deleted
100644 → 0
View file @
93872128
import
gc
from
unittest.mock
import
patch
import
pytest
import
torch
import
triton
import
triton.language
as
tl
from
vllm.model_executor.layers.ops.sample
import
(
_sample_triton
,
_uniform_to_exponential
,
sample
)
from
vllm.model_executor.sampling_metadata
import
SamplingTensors
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.triton_utils.libentry
import
LibEntry
from
vllm.triton_utils.sample
import
(
MAX_TRITON_N_COLS
,
get_num_triton_sampler_splits
)
SINGLE_SPLIT_VOCAB_SIZE
=
32000
# llama/mistral/mixtral vocab size
MULTI_SPLIT_VOCAB_SIZE
=
MAX_TRITON_N_COLS
+
100
@pytest.fixture(autouse=True)
def _cleanup():
    """Autouse fixture: after each test, collect garbage and empty the
    CUDA cache."""
    # Nothing to set up; everything below the yield runs after the test.
    yield
    gc.collect()
    torch.cuda.empty_cache()
@triton.jit
def _uniform_to_exponential_kernel(input, output, n: tl.constexpr):
    # Test-only kernel: loads n values from `input`, applies
    # _uniform_to_exponential to them and stores the result to `output`.
    offsets = tl.arange(0, n)
    uniform = tl.load(input + offsets)
    exponential = _uniform_to_exponential(uniform)
    tl.store(output + offsets, exponential)
def test_uniform_to_exponential():
    """Test that we can convert uniform to exponential without div by 0."""
    # The two inputs are 0.0 and the value one float32 epsilon below 1.0.
    almost_one = 1.0 - torch.finfo(torch.float32).eps
    uniform = torch.tensor([0.0, almost_one],
                           dtype=torch.float32,
                           device="cuda")
    result = torch.zeros(uniform.shape, dtype=torch.float32, device="cuda")

    _uniform_to_exponential_kernel[(1, )](uniform, result, 2)

    # Outputs must be finite and strictly positive ...
    assert torch.all(torch.isfinite(result))
    assert torch.all(result > 0)
    # ... and dividing by them must stay finite as well.
    inverse = torch.full_like(result, 1.0) / result
    assert torch.all(torch.isfinite(inverse))
@pytest.mark.parametrize("random_sampling", [True, False, "mixed"])
@pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5])
@pytest.mark.parametrize("modify_greedy_probs", [True, False])
@pytest.mark.parametrize("seed", [1337])
@pytest.mark.parametrize("vocab_size",
                         [SINGLE_SPLIT_VOCAB_SIZE, MULTI_SPLIT_VOCAB_SIZE])
@pytest.mark.parametrize("save_logprobs", [True, False])
def test_sample_decoding_only(random_sampling, max_best_of,
                              modify_greedy_probs, seed, vocab_size,
                              save_logprobs):
    """Check ``sample`` on a batch where every row is a one-hot
    distribution, so the sampled token is known in advance.

    Covers fully random, fully greedy and "mixed" batches, and checks the
    returned tokens, modified probs and (optionally) logprobs.
    """
    set_random_seed(seed)
    bs = 8
    # Row i puts all of its probability mass on token i * stride.
    stride = vocab_size // bs
    probs = torch.zeros((bs, vocab_size), dtype=torch.float32, device="cuda")
    for row in range(bs):
        probs[row, row * stride] = 1.0
    logprobs = torch.rand_like(probs)
    sample_indices = torch.arange(bs, dtype=torch.long, device="cuda")
    n_splits = get_num_triton_sampler_splits(probs.shape[1])

    mask_shape = (n_splits, bs)
    if random_sampling == "mixed":
        # One random decision per request, shared by all splits.
        per_request = torch.rand((1, bs), device="cuda") < 0.5
        random_sampling_mask = per_request.expand(n_splits, bs)
    elif random_sampling:
        random_sampling_mask = torch.ones(mask_shape,
                                          dtype=torch.bool,
                                          device="cuda")
    else:
        random_sampling_mask = torch.zeros(mask_shape,
                                           dtype=torch.bool,
                                           device="cuda")

    # Seeds are multiplied by the mask, so masked-out requests get seed 0.
    seeds = torch.randint(1,
                          torch.iinfo(torch.long).max,
                          mask_shape,
                          device="cuda").mul_(random_sampling_mask)

    # The current _sample_triton does not utilize the
    # libentry decoration. The purpose of adding this patch is to test
    # the correctness of libentry.
    with patch("vllm.model_executor.layers.ops.sample._sample_triton",
               LibEntry(_sample_triton)):
        sampled_tokens, sampled_logprobs, sampled_modified_probs = sample(
            probs=probs,
            logprobs=logprobs,
            sample_indices=sample_indices,
            seeds=seeds,
            max_best_of=max_best_of,
            modify_greedy_probs=modify_greedy_probs,
            save_logprobs=save_logprobs,
            _save_modified_probs=True)

    assert sampled_tokens.shape == (bs, max_best_of)
    for row in range(bs):
        assert torch.all(sampled_tokens[row] == row * stride)
        uses_random = bool(random_sampling_mask[0, row])
        picked_probs = probs[row][sampled_tokens[row]]
        if uses_random:
            # If the request is random, we want to make sure
            # sampled_modified_probs tensor has noise added
            # (and thus is different from probs tensor)
            assert not torch.allclose(sampled_modified_probs[row][0],
                                      picked_probs)
        elif modify_greedy_probs:
            # If we are modifying greedy probs and the request is greedy,
            # we want to make sure the probs tensor is modified in place
            torch.testing.assert_close(picked_probs,
                                       torch.full_like(picked_probs, 1.0))
            assert torch.sum(probs[row]) == 1.0
            first_modified = sampled_modified_probs[row][0]
            torch.testing.assert_close(first_modified,
                                       torch.full_like(first_modified, 1.0))
        else:
            # If the request is greedy and we are not modifying greedy probs,
            # we want to make sure sampled_modified_probs tensor is the same
            # as the probs tensor.
            torch.testing.assert_close(sampled_modified_probs[row],
                                       picked_probs)

    if not save_logprobs:
        assert sampled_logprobs is None
        return

    assert sampled_logprobs.shape == (bs, max_best_of)
    for row in range(bs):
        for best_of in range(max_best_of):
            expected = logprobs[row][sampled_tokens[row, best_of]]
            assert torch.all(sampled_logprobs[row] == expected)
@pytest.mark.parametrize("random_sampling", [True, False, "mixed"])
@pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5])
@pytest.mark.parametrize("modify_greedy_probs", [True, False])
@pytest.mark.parametrize("seed", [1337])
@pytest.mark.parametrize("vocab_size",
                         [SINGLE_SPLIT_VOCAB_SIZE, MULTI_SPLIT_VOCAB_SIZE])
def test_sample_prompt_logprobs(random_sampling, max_best_of,
                                modify_greedy_probs, seed, vocab_size):
    """Check ``sample`` when only some rows of ``probs`` are sampled.

    ``sample_indices`` is the running sum of ``prompt_sizes``, so only 8
    of the rows are sampled. Each row is one-hot, which makes the
    expected token for every sampled row known in advance.
    """
    set_random_seed(seed)
    prompt_sizes = [16, 32, 64, 128] * 2
    samples = 8
    bs = samples + sum(prompt_sizes)
    # Row i puts all of its probability mass on token i * stride.
    stride = vocab_size // bs
    probs = torch.zeros((bs, vocab_size), dtype=torch.float32, device="cuda")
    for row in range(bs):
        probs[row, row * stride] = 1.0
    logprobs = torch.rand_like(probs)
    sample_indices = torch.tensor(prompt_sizes,
                                  dtype=torch.long,
                                  device="cuda").cumsum_(0)
    n_splits = get_num_triton_sampler_splits(probs.shape[1])

    mask_shape = (n_splits, samples)
    if random_sampling == "mixed":
        random_sampling_mask = torch.rand(mask_shape, device="cuda") < 0.5
    elif random_sampling:
        random_sampling_mask = torch.ones(mask_shape,
                                          dtype=torch.bool,
                                          device="cuda")
    else:
        random_sampling_mask = torch.zeros(mask_shape,
                                           dtype=torch.bool,
                                           device="cuda")

    # Seeds are multiplied by the mask, so masked-out entries get seed 0.
    seeds = torch.randint(1,
                          torch.iinfo(torch.long).max,
                          mask_shape,
                          device="cuda").mul_(random_sampling_mask)

    # _sample_triton is wrapped in LibEntry for the duration of the call.
    with patch("vllm.model_executor.layers.ops.sample._sample_triton",
               LibEntry(_sample_triton)):
        sampled_tokens, sampled_logprobs, _ = sample(
            probs=probs,
            logprobs=logprobs,
            sample_indices=sample_indices,
            seeds=seeds,
            max_best_of=max_best_of,
            modify_greedy_probs=modify_greedy_probs,
            save_logprobs=True)

    expected_shape = (samples, max_best_of)
    assert sampled_tokens.shape == expected_shape
    assert sampled_logprobs.shape == expected_shape
    for i, t in enumerate(sample_indices):
        assert torch.all(sampled_tokens[i] == t * stride)
        source_row = logprobs[sample_indices[i]]
        for best_of in range(max_best_of):
            expected = source_row[sampled_tokens[i, best_of]]
            assert torch.all(sampled_logprobs[i] == expected)
@pytest.mark.parametrize("seed", list(range(16)))
def test_get_sequence_seeds(seed):
    """Ensure that we get a different child seed from base
    seed + extra entropy"""
    extra_entropy = 1
    previous_seed = None
    for step in range(512):
        plain_seed = SamplingTensors._get_sequence_seeds(
            seed, step, seeds_to_generate=1, is_greedy=False)[0]
        salted_seed = SamplingTensors._get_sequence_seeds(
            seed, step, extra_entropy, seeds_to_generate=1,
            is_greedy=False)[0]

        # Extra entropy must change the generated seed ...
        assert salted_seed != plain_seed
        # ... and consecutive steps must not repeat the previous seed.
        assert previous_seed != plain_seed
        previous_seed = plain_seed
tests/kernels/test_utils.py
0 → 100644
View file @
539aa992
"""
Tests for miscellaneous utilities
"""
import
pytest
import
torch
from
tests.kernels.utils
import
opcheck
from
vllm.platforms
import
current_platform
def test_convert_fp8_opcheck():
    """Run ``opcheck`` on the ``convert_fp8`` cache op.

    The source is a random 256x256 float32 tensor; the destination has
    the same shape with dtype ``float8_e4m3fn``.
    """
    source = torch.randn((256, 256), dtype=torch.float32, device="cuda")
    destination = torch.empty_like(source, dtype=torch.float8_e4m3fn)
    op_args = (destination, source, 1.0, "fp8")
    opcheck(torch.ops._C_cache_ops.convert_fp8, op_args)
@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="Only supported for CUDA")
def test_cuda_utils_opcheck():
    """Run ``opcheck`` on the two ``_C_cuda_utils`` ops."""
    cuda_utils = torch.ops._C_cuda_utils
    opcheck(cuda_utils.get_device_attribute, (0, 0))
    opcheck(cuda_utils.get_max_shared_memory_per_block_device_attribute,
            (0, ))
tests/kernels/utils.py
View file @
539aa992
...
...
@@ -2,12 +2,14 @@
import
itertools
import
random
import
unittest
from
numbers
import
Number
from
typing
import
(
Any
,
Dict
,
List
,
NamedTuple
,
Optional
,
Sequence
,
Tuple
,
Union
)
import
pytest
import
torch
from
torch._prims_common
import
TensorLikeType
from
vllm.attention
import
AttentionBackend
,
AttentionMetadata
,
AttentionType
from
vllm.utils
import
(
STR_BACKEND_ENV_VAR
,
STR_XFORMERS_ATTN_VAL
,
...
...
@@ -946,6 +948,34 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
output_under_test
.
view_as
(
ideal_output
))
# Copied/modified from torch._refs.__init__.py
def fp8_allclose(
    a: TensorLikeType,
    b: TensorLikeType,
    rtol: float = 1e-05,
    atol: float = 1e-08,
    equal_nan: bool = False,
) -> bool:
    """
    Reference implementation of torch.allclose

    Both inputs are converted to float64 before being compared with
    ``torch.isclose``.

    Args:
        a: First tensor.
        b: Second tensor.
        rtol: Relative tolerance passed to ``torch.isclose``.
        atol: Absolute tolerance passed to ``torch.isclose``.
        equal_nan: Whether two NaNs compare as equal.

    Returns:
        True if every element pair is close, False otherwise.
    """
    # Same argument validation as the torch reference implementation.
    torch._refs._check_close_args(name="torch.allclose",
                                  a=a,
                                  b=b,
                                  rtol=rtol,
                                  atol=atol)

    lhs = a.double()
    rhs = b.double()
    elementwise = torch.isclose(lhs,
                                rhs,
                                rtol=rtol,
                                atol=atol,
                                equal_nan=equal_nan)
    return bool(torch.all(elementwise).item())
# A special version of op check that has a restricted default set of test_utils
# and a patched version of allclose that supports fp8 types.
def
opcheck
(
op
:
Union
[
torch
.
_ops
.
OpOverload
,
torch
.
_ops
.
OpOverloadPacket
,
torch
.
_library
.
custom_ops
.
CustomOpDef
],
args
:
Tuple
[
Any
,
...],
...
...
@@ -954,9 +984,10 @@ def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket,
test_utils
:
Union
[
str
,
Sequence
[
str
]]
=
ALL_OPCHECK_TEST_UTILS
,
raise_exception
:
bool
=
True
,
cond
:
bool
=
True
)
->
Dict
[
str
,
str
]:
return
torch
.
library
.
opcheck
(
op
,
args
,
kwargs
,
test_utils
=
test_utils
,
raise_exception
=
raise_exception
)
if
cond
else
{}
with
unittest
.
mock
.
patch
(
'torch.allclose'
,
new
=
fp8_allclose
):
return
torch
.
library
.
opcheck
(
op
,
args
,
kwargs
,
test_utils
=
test_utils
,
raise_exception
=
raise_exception
)
if
cond
else
{}
tests/lora/conftest.py
View file @
539aa992
...
...
@@ -65,10 +65,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
to initialize torch.
"""
if
request
.
node
.
get_closest_marker
(
"skip_global_cleanup"
):
return
False
return
True
return
not
request
.
node
.
get_closest_marker
(
"skip_global_cleanup"
)
@
pytest
.
fixture
(
autouse
=
True
)
...
...
tests/lora/test_layers.py
View file @
539aa992
...
...
@@ -39,6 +39,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
,
get_masked_input_and_mask
)
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.utils
import
seed_everything
from
.utils
import
DummyLoRAManager
...
...
@@ -922,9 +923,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
seq_len
)
->
None
:
dtype
=
torch
.
float16
seed
=
0
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
PunicaWrapper
(
8192
,
256
,
device
)
max_loras
=
8
...
...
tests/lora/test_punica_sizes.py
View file @
539aa992
...
...
@@ -4,7 +4,6 @@ hidden_sizes included in the LoRA models currently supported by vLLM. It tests
whether the corresponding Triton kernel can run normally when tensor parallelism
is set to [1, 2, 4, 8, 16, 32, 64].
"""
import
random
from
unittest.mock
import
patch
import
pytest
...
...
@@ -17,6 +16,7 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand
from
vllm.lora.ops.sgmv_expand_slice
import
sgmv_expand_slice
from
vllm.lora.ops.sgmv_shrink
import
sgmv_shrink
from
vllm.triton_utils.libentry
import
LibEntry
from
vllm.utils
import
seed_everything
from
.utils
import
(
generate_data
,
generate_data_for_expand_nslices
,
ref_torch_groupgemm
)
...
...
@@ -145,11 +145,8 @@ def test_punica_sgmv(
seed
:
int
,
device
:
str
,
):
random
.
seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
seq_length
=
128
(
...
...
@@ -172,6 +169,7 @@ def test_punica_sgmv(
device
,
)
max_seq_length
=
seq_len_tensor
.
max
()
token_nums
=
seq_len_tensor
.
sum
().
item
()
if
isinstance
(
max_seq_length
,
tuple
):
max_seq_length
=
max_seq_length
[
0
].
item
()
else
:
...
...
@@ -186,6 +184,7 @@ def test_punica_sgmv(
lora_indices_tensor
,
batches
,
max_seq_length
,
token_nums
,
scaling
,
)
else
:
...
...
@@ -198,6 +197,7 @@ def test_punica_sgmv(
lora_indices_tensor
,
batches
,
max_seq_length
,
token_nums
,
add_inputs
=
True
,
)
ref_torch_groupgemm
(
...
...
@@ -238,11 +238,8 @@ def test_punica_bgmv(
from
vllm.lora.ops.bgmv_expand
import
_bgmv_expand_kernel
from
vllm.lora.ops.bgmv_shrink
import
_bgmv_shrink_kernel
random
.
seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
seq_length
=
1
(
...
...
@@ -329,11 +326,9 @@ def test_punica_expand_nslices(
):
from
vllm.lora.ops.bgmv_expand_slice
import
_bgmv_expand_slice_kernel
random
.
seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
seq_length
=
128
if
op_type
==
"sgmv"
else
1
(
inputs_tensor
,
...
...
@@ -355,6 +350,7 @@ def test_punica_expand_nslices(
device
,
)
max_seq_length
=
seq_len_tensor
.
max
()
token_nums
=
seq_len_tensor
.
sum
().
item
()
if
isinstance
(
max_seq_length
,
tuple
):
max_seq_length
=
max_seq_length
[
0
].
item
()
else
:
...
...
@@ -372,6 +368,7 @@ def test_punica_expand_nslices(
lora_indices_tensor
,
batches
,
max_seq_length
,
token_nums
,
slice_offset
,
hidden_size
,
add_inputs
=
True
,
...
...
tests/lora/test_punica_variation.py
View file @
539aa992
...
...
@@ -3,7 +3,6 @@ This script is mainly used to test whether Triton kernels can run normally
under different conditions, including various batches, numbers of LoRA, and
maximum ranks.
"""
import
random
from
unittest.mock
import
patch
import
pytest
...
...
@@ -16,6 +15,7 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand
from
vllm.lora.ops.sgmv_expand_slice
import
sgmv_expand_slice
from
vllm.lora.ops.sgmv_shrink
import
sgmv_shrink
from
vllm.triton_utils.libentry
import
LibEntry
from
vllm.utils
import
seed_everything
from
.utils
import
(
generate_data
,
generate_data_for_expand_nslices
,
ref_torch_groupgemm
)
...
...
@@ -60,11 +60,8 @@ def test_punica_sgmv(
seed
:
int
,
device
:
str
,
):
random
.
seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
seq_length
=
128
(
...
...
@@ -87,6 +84,7 @@ def test_punica_sgmv(
device
,
)
max_seq_length
=
seq_len_tensor
.
max
()
token_nums
=
seq_len_tensor
.
sum
().
item
()
if
isinstance
(
max_seq_length
,
tuple
):
max_seq_length
=
max_seq_length
[
0
].
item
()
else
:
...
...
@@ -101,6 +99,7 @@ def test_punica_sgmv(
lora_indices_tensor
,
batches
,
max_seq_length
,
token_nums
,
scaling
,
)
else
:
...
...
@@ -113,6 +112,7 @@ def test_punica_sgmv(
lora_indices_tensor
,
batches
,
max_seq_length
,
token_nums
,
add_inputs
=
True
,
)
ref_torch_groupgemm
(
...
...
@@ -153,11 +153,8 @@ def test_punica_bgmv(
from
vllm.lora.ops.bgmv_expand
import
_bgmv_expand_kernel
from
vllm.lora.ops.bgmv_shrink
import
_bgmv_shrink_kernel
random
.
seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
seq_length
=
1
(
...
...
@@ -244,11 +241,9 @@ def test_punica_expand_nslices(
):
from
vllm.lora.ops.bgmv_expand_slice
import
_bgmv_expand_slice_kernel
random
.
seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
seq_length
=
128
if
op_type
==
"sgmv"
else
1
(
inputs_tensor
,
...
...
@@ -270,6 +265,7 @@ def test_punica_expand_nslices(
device
,
)
max_seq_length
=
seq_len_tensor
.
max
()
token_nums
=
seq_len_tensor
.
sum
().
item
()
if
isinstance
(
max_seq_length
,
tuple
):
max_seq_length
=
max_seq_length
[
0
].
item
()
else
:
...
...
@@ -287,6 +283,7 @@ def test_punica_expand_nslices(
lora_indices_tensor
,
batches
,
max_seq_length
,
token_nums
,
slice_offset
,
hidden_size
,
add_inputs
=
True
,
...
...
tests/models/decoder_only/language/test_big_models.py
View file @
539aa992
...
...
@@ -5,7 +5,8 @@ This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
"""
import
pytest
import
torch
from
vllm.platforms
import
current_platform
from
...utils
import
check_outputs_equal
...
...
@@ -19,10 +20,12 @@ MODELS = [
# "Qwen/Qwen1.5-0.5B" # Broken,
]
if
not
current_platform
.
is_cpu
():
# MiniCPM requires fused_moe which is not supported by CPU
MODELS
.
append
(
"openbmb/MiniCPM3-4B"
)
#TODO: remove this after CPU float16 support ready
target_dtype
=
"float"
if
torch
.
cuda
.
is_available
():
target_dtype
=
"half"
target_dtype
=
"float"
if
current_platform
.
is_cpu
()
else
"half"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
@@ -39,7 +42,7 @@ def test_models(
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
...
...
@@ -57,7 +60,7 @@ def test_model_print(
model
:
str
,
dtype
:
str
,
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
...
...
tests/models/decoder_only/language/test_granite.py
View file @
539aa992
...
...
@@ -2,23 +2,18 @@
Run `pytest tests/models/test_granite.py`.
"""
import
importlib.metadata
import
pytest
import
transformers
from
...utils
import
check_logprobs_close
TRANSFORMERS_VERSION
=
tuple
(
map
(
int
,
importlib
.
metadata
.
version
(
"transformers"
).
split
(
"."
)))
MODELS
=
[
"ibm/PowerLM-3b"
,
]
# GraniteForCausalLM will be in transformers >= 4.45
@
pytest
.
mark
.
skipif
(
TRANSFORMERS_VERSION
<
(
4
,
45
)
,
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
<
"4.
45
"
,
reason
=
"granite model test requires transformers >= 4.45"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
...
...
tests/models/decoder_only/language/test_mistral.py
View file @
539aa992
...
...
@@ -4,13 +4,65 @@ Run `pytest tests/models/test_mistral.py`.
"""
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
check_logprobs_close
# Model checkpoints the tests in this module are parametrized over.
MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.1",
    "mistralai/Mistral-7B-Instruct-v0.3",
    # Mistral-Nemo is too big for CI, but passes locally
    # "mistralai/Mistral-Nemo-Instruct-2407"
]

# Sampling settings shared by the chat-based tests below.
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)

# Non-Latin-script prompts used by test_mistral_symbolic_languages.
SYMBOLIC_LANG_PROMPTS = [
    "勇敢な船乗りについての詩を書く",  # japanese
    "寫一首關於勇敢的水手的詩",  # chinese
]

# for function calling
# Tool schema passed as `tools=` in test_mistral_function_calling.
TOOLS = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type":
                    "string",
                    "description":
                    "The city to find the weather for, e.g. 'San Francisco'"
                },
                "state": {
                    "type":
                    "string",
                    "description":
                    "the two-letter abbreviation for the state that the city is"
                    " in, e.g. 'CA' which would mean 'California'"
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"]
                }
            },
            "required": ["city", "state", "unit"]
        }
    }
}]
# Single-turn conversation sent together with TOOLS.
MSGS = [{
    "role":
    "user",
    "content": ("Can you tell me what the temperate"
                " will be in Dallas, in fahrenheit?")
}]
# Exact (stripped) completion text the function-calling test asserts on.
EXPECTED_FUNC_CALL = (
    '[{"name": "get_current_weather", "arguments": '
    '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]')
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
...
...
@@ -81,3 +133,42 @@ def test_mistral_format(
name_0
=
"hf"
,
name_1
=
"mistral"
,
)
@pytest.mark.parametrize("model", MODELS[1:])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
def test_mistral_symbolic_languages(
    model: str,
    dtype: str,
    prompt: str,
) -> None:
    """Check that chatting in a non-Latin script yields decodable text.

    The parametrized ``prompt`` is sent as a single user message through
    ``LLM.chat`` with the Mistral tokenizer/config/load formats, and the
    generated text must not contain the Unicode replacement character.

    Args:
        model: Model name taken from ``MODELS[1:]``.
        dtype: Model dtype string.
        prompt: One entry of ``SYMBOLIC_LANG_PROMPTS``.
    """
    # NOTE: the parametrized prompt is used as-is. A previous hard-coded
    # override (`prompt = "hi"`) meant the symbolic-language prompts were
    # never actually sent to the model.
    msg = {"role": "user", "content": prompt}
    llm = LLM(model=model,
              dtype=dtype,
              max_model_len=8192,
              tokenizer_mode="mistral",
              config_format="mistral",
              load_format="mistral")
    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
    # U+FFFD in the output means some bytes could not be decoded.
    assert "�" not in outputs[0].outputs[0].text.strip()
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
def test_mistral_function_calling(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Chat with ``TOOLS`` attached and compare against the expected call.

    The stripped text of the first completion must equal
    ``EXPECTED_FUNC_CALL`` exactly.
    """
    # All three loader options are switched to the Mistral-native format.
    mistral_options = dict(tokenizer_mode="mistral",
                           config_format="mistral",
                           load_format="mistral")

    with vllm_runner(model, dtype=dtype, **mistral_options) as runner:
        chat_results = runner.model.chat(MSGS,
                                         tools=TOOLS,
                                         sampling_params=SAMPLING_PARAMS)

        first_completion = chat_results[0].outputs[0]
        generated_text = first_completion.text.strip()
        assert generated_text == EXPECTED_FUNC_CALL
tests/models/decoder_only/vision_language/test_llava_next_video.py
View file @
539aa992
...
...
@@ -105,9 +105,6 @@ def run_test(
for
asset
in
video_assets
]
for
video
in
videos
:
print
(
video
.
shape
)
if
size_factors
is
not
None
:
inputs_per_video
=
[(
[
prompt
for
_
in
size_factors
],
...
...
tests/models/decoder_only/vision_language/test_llava_onevision.py
0 → 100644
View file @
539aa992
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
import
pytest
import
transformers
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
)
from
vllm.multimodal.utils
import
(
rescale_image_size
,
rescale_video_size
,
resize_video
,
sample_frames_from_video
)
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
(
VIDEO_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_VideoAssets
)
from
...utils
import
check_logprobs_close
# Video test
HF_VIDEO_PROMPTS
=
VIDEO_ASSETS
.
prompts
({
"sample_demo_1"
:
"<|im_start|>user <video>
\n
why is this video funny?
\
<|im_end|><|im_start|>assistant
\n
"
})
models
=
[
"llava-hf/llava-onevision-qwen2-7b-ov-hf"
]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output.

    Runs of consecutive video placeholder tokens are collapsed to a single
    token, and the decoded EOS token is appended to the text when the
    (collapsed) token ids end with EOS.

    Args:
        vllm_output: ``(token_ids, text, logprobs)`` as produced by vLLM.
        model: Model name used to load the HF config and tokenizer.

    Returns:
        ``(hf_output_ids, hf_output_str, out_logprobs)``; the logprobs are
        passed through unchanged.
    """
    output_ids, output_str, out_logprobs = vllm_output

    config = AutoConfig.from_pretrained(model)
    video_token_id = config.video_token_index

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    # Keep a video token only when it starts a run. The explicit `idx == 0`
    # check prevents `output_ids[idx - 1]` from wrapping around to the last
    # element and wrongly dropping a leading video token.
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
        if (token_id != video_token_id or idx == 0
            or output_ids[idx - 1] != video_token_id)
    ]

    hf_output_str = output_str
    # Guard against an empty generation before peeking at the last token.
    if hf_output_ids and hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs
@overload
def run_video_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


@overload
def run_video_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    sizes: List[Tuple[int, int]],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    ...


def run_video_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    video_assets: _VideoAssets,
    model: str,
    *,
    size_factors: Optional[List[float]] = None,
    sizes: Optional[List[Tuple[int, int]]] = None,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    num_frames: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Compare vLLM and HF greedy logprobs on video prompts.

    Each asset is reduced to ``num_frames`` frames, then either rescaled by
    every entry of ``size_factors`` or resized to every entry of ``sizes``.
    The same prompts and videos are fed to both runners, and the outputs are
    checked with ``check_logprobs_close``.

    Raises:
        ValueError: If neither ``size_factors`` nor ``sizes`` is given.
    """
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    sampled_videos = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

    # One (prompts, videos) pair per asset; the prompt is repeated once for
    # every size variant of that asset's video.
    inputs_per_video = []
    if size_factors is not None:
        for clip, text in zip(sampled_videos, HF_VIDEO_PROMPTS):
            repeated_prompts = [text] * len(size_factors)
            scaled_clips = [
                rescale_video_size(clip, factor) for factor in size_factors
            ]
            inputs_per_video.append((repeated_prompts, scaled_clips))
    elif sizes is not None:
        for clip, text in zip(sampled_videos, HF_VIDEO_PROMPTS):
            repeated_prompts = [text] * len(sizes)
            resized_clips = [resize_video(clip, size) for size in sizes]
            inputs_per_video.append((repeated_prompts, resized_clips))
    else:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     dtype=dtype,
                     max_model_len=4096,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs_per_video = []
        for prompt_batch, video_batch in inputs_per_video:
            vllm_outputs_per_video.append(
                vllm_model.generate_greedy_logprobs(prompt_batch,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    videos=video_batch))

    def cast_pixel_values(hf_inputs: BatchEncoding):
        # Bring the HF video pixel values to the dtype under test.
        pixel_values = hf_inputs["pixel_values_videos"]
        hf_inputs["pixel_values_videos"] = pixel_values.to(
            torch_dtype)  # type: ignore
        return hf_inputs

    with hf_runner(model,
                   dtype=dtype,
                   postprocess_inputs=cast_pixel_values,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_video = []
        for prompt_batch, video_batch in inputs_per_video:
            hf_outputs_per_video.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompt_batch,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    videos=video_batch))

    paired_outputs = zip(hf_outputs_per_video, vllm_outputs_per_video)
    for hf_outputs, vllm_outputs in paired_outputs:
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        sanitized_vllm_outputs = [
            vllm_to_hf_output(single_output, model)
            for single_output in vllm_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=sanitized_vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
# Compare (major, minor) numerically: a plain string comparison against
# "4.45" orders "4.5" after it and "4.100" before it.
@pytest.mark.skipif(
    tuple(int(part)
          for part in transformers.__version__.split(".")[:2]) < (4, 45),
    reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No video
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
                dtype, max_tokens, num_logprobs, num_frames) -> None:
    """Inference result should be the same between hf and vllm.

    All the video fixtures for the test are under tests/videos.
    For huggingface runner, we provide the np.ndarray as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    run_video_test(
        hf_runner,
        vllm_runner,
        video_assets,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        num_frames=num_frames,
        tensor_parallel_size=1,
    )
# Compare (major, minor) numerically: a plain string comparison against
# "4.45" orders "4.5" after it and "4.100" before it.
@pytest.mark.skipif(
    tuple(int(part)
          for part in transformers.__version__.split(".")[:2]) < (4, 45),
    reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "sizes",
    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("num_frames", [16])
def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
                            dtype, max_tokens, num_logprobs,
                            num_frames) -> None:
    """Compare hf and vllm outputs on videos resized to fixed sizes.

    Delegates to ``run_video_test`` with the explicit ``sizes`` list
    instead of scale factors, on a single device
    (``tensor_parallel_size=1``).
    """
    run_video_test(
        hf_runner,
        vllm_runner,
        video_assets,
        model,
        sizes=sizes,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        num_frames=num_frames,
        tensor_parallel_size=1,
    )
# Image test
_LIMIT_IMAGE_PER_PROMPT
=
4
def run_image_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Compare vLLM and HF greedy logprobs on image prompts.

    Every ``(prompts, images)`` pair in ``inputs`` is run through the vLLM
    runner first and the HF runner second; the per-pair outputs are then
    checked with ``check_logprobs_close`` after sanitizing the vLLM side.
    """
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    # max_model_len should be greater than image_feature_size
    image_limit = {"image": _LIMIT_IMAGE_PER_PROMPT}
    with vllm_runner(model,
                     dtype=dtype,
                     max_model_len=32768,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True,
                     limit_mm_per_prompt=image_limit) as vllm_model:
        vllm_outputs_per_image = []
        for prompt_batch, image_batch in inputs:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(prompt_batch,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=image_batch))

    def cast_pixel_values(hf_inputs: BatchEncoding):
        # Bring the HF pixel values to the dtype under test.
        pixel_values = hf_inputs["pixel_values"]
        hf_inputs["pixel_values"] = pixel_values.to(
            torch_dtype)  # type: ignore
        return hf_inputs

    with hf_runner(model,
                   dtype=dtype,
                   postprocess_inputs=cast_pixel_values,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_image = []
        for prompt_batch, image_batch in inputs:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompt_batch,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=image_batch))

    paired_outputs = zip(hf_outputs_per_image, vllm_outputs_per_image)
    for hf_outputs, vllm_outputs in paired_outputs:
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        sanitized_vllm_outputs = [
            vllm_to_hf_output(single_output, model)
            for single_output in vllm_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=sanitized_vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
<
"4.45"
,
reason
=
"Waiting for next transformers release"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models_multiple_image_inputs
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
stop_sign
=
image_assets
[
0
].
pil_image
cherry_blossom
=
image_assets
[
1
].
pil_image
inputs
=
[(
[
"<|im_start|>user <image><image>
\n
Describe 2 images.
\
<|im_end|><|im_start|>assistant
\n
"
,
"<|im_start|>user <image><image>
\n
Describe 2 images.
\
<|im_end|><|im_start|>assistant
\n
"
,
"<|im_start|>user <image><image><image><image>
\n
Describe 4 images.
\
<|im_end|><|im_start|>assistant
\n
"
,
"<|im_start|>user <image>
\n
What is the season?
\
<|im_end|><|im_start|>assistant
\n
"
,
],
[
[
stop_sign
,
cherry_blossom
],
# Images with different sizes and aspect-ratios
[
rescale_image_size
(
stop_sign
,
0.1
),
stop_sign
,
],
[
stop_sign
,
rescale_image_size
(
stop_sign
,
0.25
),
cherry_blossom
.
resize
((
183
,
488
)),
cherry_blossom
.
resize
((
488
,
183
))
],
cherry_blossom
,
])]
run_image_test
(
hf_runner
,
vllm_runner
,
inputs
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
)
tests/models/decoder_only/vision_language/test_phi3v.py
View file @
539aa992
import
os
import
re
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
Callable
,
List
,
Optional
,
Tuple
,
Type
import
pytest
from
transformers
import
AutoTokenizer
import
torch
from
transformers
import
AutoImageProcessor
,
AutoTokenizer
from
vllm.inputs
import
InputContext
,
LLMInputs
from
vllm.model_executor.models.phi3v
import
_IMAGE_TOKEN_ID
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
,
is_hip
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
...utils
import
check_logprobs_close
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
...utils
import
build_model_context
,
check_logprobs_close
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
...
...
@@ -71,7 +76,7 @@ def run_test(
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
...
...
@@ -230,3 +235,174 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
mm_limit
=
2
,
tensor_parallel_size
=
1
,
)
### Fast tests for correctness in processor_kwarg override handling
# Wrap lazy imports to avoid initializing CUDA during test collection
@pytest.fixture()
def input_processor_for_phi3v():
    """Return the phi3v input processor, imported inside the fixture."""
    import vllm.model_executor.models.phi3v as phi3v_module
    return phi3v_module.input_processor_for_phi3v
@pytest.fixture()
def dummy_data_for_phi3v():
    """Return the phi3v dummy-data factory, imported inside the fixture."""
    import vllm.model_executor.models.phi3v as phi3v_module
    return phi3v_module.dummy_data_for_phi3v
@pytest.fixture()
def get_max_phi3v_image_tokens():
    """Return the phi3v max-image-tokens function, imported in the fixture."""
    import vllm.model_executor.models.phi3v as phi3v_module
    return phi3v_module.get_max_phi3v_image_tokens
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops", [4, 16, None])
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
                               num_crops: Optional[int]):
    """Ensure that the [default] input mapper handles num_crops properly."""
    # The processor kwargs go into the model context because this model uses
    # the default mapper, which falls back to the HF mapper and forwards
    # mm_processor_kwargs to it.
    mm_processor_kwargs = {}
    if num_crops is not None:
        mm_processor_kwargs = {"num_crops": num_crops}

    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=mm_processor_kwargs,
    )

    # Reference processor built directly from HF with the same overrides.
    hf_processor = AutoImageProcessor.from_pretrained(model,
                                                      trust_remote_code=True,
                                                      **mm_processor_kwargs)

    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)

    image = image_assets[0].pil_image
    hf_result = hf_processor.preprocess(image, return_tensors="pt")
    vllm_result = mm_registry.map_input(ctx.model_config, {"image": image})

    # Both paths must produce element-wise identical tensors.
    for key in ("image_sizes", "num_img_tokens", "pixel_values"):
        assert torch.all(hf_result[key] == vllm_result[key])

    # For pixel values, the second axis should be num_crops + 1 (the extra
    # entry is the rescaled original image). vLLM's default falls back to
    # the HF config, hence the comparison against the processor's num_crops.
    crops_axis_len = vllm_result["pixel_values"].shape[1]
    assert crops_axis_len == hf_processor.num_crops + 1
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,expected_max_tokens", [
    (4, 781),
    (16, 2653),
])
def test_max_tokens_override(get_max_phi3v_image_tokens: Callable, model: str,
                             num_crops: int, expected_max_tokens: int):
    """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
    # NOTE: the context deliberately carries no mm_processor_kwargs. The
    # function is called directly with num_crops as a keyword argument, so
    # the result must come from that argument alone and not from a mix of
    # the call kwargs and context-level kwargs. In practice the processor
    # kwargs are wrapped in a closure around the max tokens func.
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
    )

    input_ctx = InputContext(ctx.model_config)
    actual_max_tokens = get_max_phi3v_image_tokens(input_ctx,
                                                   num_crops=num_crops)

    assert expected_max_tokens == actual_max_tokens
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [
    (4, 781, 1),
    (4, 781, 2),
    (16, 2653, 1),
    (16, 2653, 2),
])
def test_dummy_data_override(dummy_data_for_phi3v: Callable, model: str,
                             num_crops: int, toks_per_img: int, num_imgs: int):
    """Ensure dummy_data_for_phi3v handles num_crops properly."""
    # As in the max-tokens test, mm_processor_kwargs is left unset on the
    # context; num_crops reaches the dummy data func as a direct kwarg, the
    # way the partial would expand it.
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
    )

    dummy_outputs = dummy_data_for_phi3v(
        ctx=ctx,
        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
        mm_counts={"image": num_imgs},
        num_crops=num_crops,
    )
    sequence_data, _ = dummy_outputs

    # Ensure we have the right number of placeholders per num_crops size
    token_ids = sequence_data.get_token_ids()
    img_tok_count = token_ids.count(_IMAGE_TOKEN_ID)
    assert img_tok_count == toks_per_img * num_imgs
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [
    (4, 757, 1),
    (4, 757, 2),
    (16, 1921, 1),
    (16, 1921, 2),
])
def test_input_processor_override(input_processor_for_phi3v: Callable,
                                  image_assets: _ImageAssets, model: str,
                                  num_crops: int, expected_toks_per_img: int,
                                  num_imgs: int):
    """Ensure input_processor_for_phi3v handles num_crops properly."""
    # mm_processor_kwargs is not set on the context; num_crops is handed to
    # the custom input processor directly, the way the partial would expand
    # it.
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Build the image str / prompt based on the number of images we pass
    image_tags = (f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1))
    img_str = "".join(image_tags)
    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
    # The same asset image is reused for every placeholder.
    images = [image_assets[0].pil_image] * num_imgs

    prompt_token_ids = tokenizer.encode(prompt)
    llm_inputs = LLMInputs(prompt_token_ids=prompt_token_ids,
                           prompt=prompt,
                           multi_modal_data={"image": images})

    proc_llm_inputs = input_processor_for_phi3v(
        ctx=ctx,
        llm_inputs=llm_inputs,
        num_crops=num_crops,
    )

    # Ensure we have the right number of placeholders per num_crops size
    processed_ids = proc_llm_inputs["prompt_token_ids"]
    img_tok_count = processed_ids.count(_IMAGE_TOKEN_ID)
    assert img_tok_count == expected_toks_per_img * num_imgs
tests/models/decoder_only/vision_language/test_qwen.py
View file @
539aa992
...
...
@@ -5,14 +5,13 @@ import pytest
import
torch
from
PIL.Image
import
Image
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
InputContext
,
LLMInputs
from
vllm.multimodal.base
import
MultiModalInputs
from
vllm.multimodal.utils
import
cached_get_tokenizer
,
rescale_image_size
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
...utils
import
check_logprobs_close
from
...utils
import
build_model_context
,
check_logprobs_close
text_only_models
=
[
"Qwen/Qwen-7B-Chat"
# Has no visual component
...
...
@@ -42,32 +41,6 @@ VIS_ENC_DIM = 4096
IMG_SIZE
=
448
def build_model_context(model_name: str,
                        tokenizer_name: Optional[str] = None,
                        trust_remote_code: bool = False):
    """Build an InputContext wrapping a ModelConfig for the given model.

    Args:
        model_name: Name of the model being considered.
        tokenizer_name: Name of the tokenizer being considered; the model
            name is used when this is None.
        trust_remote_code: Whether or not to allow loading remote code.

    Returns:
        InputContext for the model being considered.
    """
    effective_tokenizer = (model_name
                           if tokenizer_name is None else tokenizer_name)
    return InputContext(
        ModelConfig(
            model_name,
            effective_tokenizer,
            tokenizer_mode="auto",
            trust_remote_code=trust_remote_code,
            dtype="float32",
            seed=0,
        ))
@
pytest
.
fixture
()
def
input_mapper_for_qwen
():
# Lazy import to avoid initializing CUDA during test collection
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment