Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9c4ecf15
Commit
9c4ecf15
authored
Apr 14, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.4' into v0.8.4-ori
parents
bfc2d6f7
dc1b4a6f
Changes
342
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1075 additions
and
324 deletions
+1075
-324
tests/kernels/test_block_int8.py
tests/kernels/test_block_int8.py
+199
-0
tests/kernels/test_flashmla.py
tests/kernels/test_flashmla.py
+1
-1
tests/kernels/test_int8_kernel.py
tests/kernels/test_int8_kernel.py
+149
-0
tests/kernels/test_merge_attn_states.py
tests/kernels/test_merge_attn_states.py
+265
-0
tests/kernels/test_triton_moe_ptpc_fp8.py
tests/kernels/test_triton_moe_ptpc_fp8.py
+159
-0
tests/kernels/utils_block.py
tests/kernels/utils_block.py
+63
-0
tests/lora/conftest.py
tests/lora/conftest.py
+12
-0
tests/lora/test_baichuan.py
tests/lora/test_baichuan.py
+0
-1
tests/lora/test_chatglm3_tp.py
tests/lora/test_chatglm3_tp.py
+0
-1
tests/lora/test_layers.py
tests/lora/test_layers.py
+1
-1
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+0
-1
tests/lora/test_punica_ops.py
tests/lora/test_punica_ops.py
+5
-0
tests/lora/test_quant_model.py
tests/lora/test_quant_model.py
+1
-8
tests/lora/test_transfomers_model.py
tests/lora/test_transfomers_model.py
+0
-1
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+13
-1
tests/models/decoder_only/language/test_gguf.py
tests/models/decoder_only/language/test_gguf.py
+60
-20
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+57
-20
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+0
-236
tests/models/decoder_only/vision_language/test_phi4mm.py
tests/models/decoder_only/vision_language/test_phi4mm.py
+80
-17
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+10
-16
No files found.
tests/kernels/test_block_int8.py
0 → 100644
View file @
9c4ecf15
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py
import
itertools
import
pytest
import
torch
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.model_executor.layers.quantization.utils.int8_utils
import
(
w8a8_block_int8_matmul
)
from
vllm.platforms
import
current_platform
from
.utils_block
import
native_w8a8_block_matmul
# Skip every test in this module when the reported device capability
# compares below (7, 0).
if current_platform.get_device_capability() < (7, 0):
    pytest.skip("INT8 Triton requires CUDA 7.0 or higher",
                allow_module_level=True)
# For test
def native_per_token_group_quant_int8(x,
                                      group_size,
                                      eps=1e-10,
                                      dtype=torch.int8):
    """Quantize `x` per token group using native torch ops.

    The tensor is split into consecutive groups of `group_size` elements
    along its last dimension; every group gets its own scale
    ``max(|group|, eps) / iinfo(dtype).max``.

    Args:
        x: Contiguous tensor whose last dimension is a multiple of
            `group_size`.
        group_size: Number of consecutive elements sharing one scale.
        eps: Lower bound on the per-group absolute maximum, which keeps the
            scale away from zero.
        dtype: Integer dtype of the quantized output.

    Returns:
        ``(x_q, x_s)``: `x_q` has the shape of `x` and dtype `dtype`;
        `x_s` is float32 with shape
        ``x.shape[:-1] + (x.shape[-1] // group_size,)``.

    Raises:
        AssertionError: If the last dimension is not divisible by
            `group_size` or `x` is not contiguous.
    """
    assert (x.shape[-1] % group_size == 0
            ), "the last dimension of `x` must be divisible by `group_size`"
    assert x.is_contiguous(), "`x` is not contiguous"

    iinfo = torch.iinfo(dtype)
    int8_min = iinfo.min
    int8_max = iinfo.max

    # One row per quantization group.
    x_ = x.reshape(x.numel() // group_size, group_size)
    # Use float32 for scale calculation for stability
    amax = x_.abs().max(dim=-1,
                        keepdim=True)[0].clamp(min=eps).to(torch.float32)
    x_s = amax / int8_max
    # Round before clamping
    x_q = (x_.to(torch.float32) / x_s).round().clamp(
        min=int8_min, max=int8_max).to(dtype)
    x_q = x_q.reshape(x.shape)
    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, ))

    return x_q, x_s
# For test
def torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk,
                              block_shape):
    """Reference fused MoE with block-wise int8 quantization in native torch.

    Each token is replicated `topk` times and routed to the experts chosen
    by ``topk(softmax(score))``. For every expert that received tokens, the
    routed activations go through
    ``matmul(w1) -> SiluAndMul -> re-quantize -> matmul(w2)``. The expert
    outputs are then weighted by the routing probabilities and summed per
    token.

    Args:
        a: 2-D activations, unpacked as ``(B, D)``.
        w1, w2: Per-expert weights, indexed by expert on dim 0.
        w1_s, w2_s: Per-expert block scales for `w1` / `w2`.
        score: Router logits; softmax is taken over the last dim.
        topk: Number of experts selected per token.
        block_shape: Two-element block size; element 1 is used as the
            activation quantization group size.

    Returns:
        Tensor of shape ``(B, w2.shape[1])`` in the dtype of `a`.
    """
    B, D = a.shape
    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
    score = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)

    # Activations are quantized in groups of block_k along the last dim.
    block_k = block_shape[1]
    a_q, a_s = native_per_token_group_quant_int8(a, block_k)
    for i in range(w1.shape[0]):
        mask = topk_ids == i
        if mask.sum():
            inter_out = native_w8a8_block_matmul(a_q[mask],
                                                 w1[i],
                                                 a_s[mask],
                                                 w1_s[i],
                                                 block_shape,
                                                 output_dtype=a.dtype)
            act_out = SiluAndMul().forward_native(inter_out)
            act_out_q, act_out_s = native_per_token_group_quant_int8(
                act_out, block_k)
            out[mask] = native_w8a8_block_matmul(act_out_q,
                                                 w2[i],
                                                 act_out_s,
                                                 w2_s[i],
                                                 block_shape,
                                                 output_dtype=a.dtype)
    return (out.view(B, -1, w2.shape[1]) *
            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
# Parameter grids combined with itertools.product by the tests below.
DTYPES = [torch.half, torch.bfloat16]  # output / activation dtypes
M = [1, 33, 64, 222]
N = [128, 1024]
K = [256, 4096]
E = [8, 24]
TOP_KS = [2, 6]
# BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
BLOCK_SIZE = [[128, 128]]  # only this block shape is currently exercised
SEEDS = [0]
@pytest.fixture(autouse=True, scope="module")
def setup_cuda():
    """Use CUDA as the default device for all tests in this module.

    The previous default device is restored once the module's tests have
    finished, so the setting does not leak into test modules that run
    afterwards.
    """
    previous_device = torch.get_default_device()
    torch.set_default_device("cuda")
    yield
    torch.set_default_device(previous_device)
@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
                         itertools.product(M, N, K, BLOCK_SIZE, DTYPES,
                                           SEEDS))
@torch.inference_mode()
def test_w8a8_block_int8_matmul(M, N, K, block_size, out_dtype, seed):
    """Compare `w8a8_block_int8_matmul` with the native torch reference.

    Random operands are drawn in the int8 value range and random block
    scales are generated; the mean relative difference between the two
    results must stay below 1e-3.
    """
    torch.manual_seed(seed)
    scale_factor = 1e-2
    int8_limits = torch.iinfo(torch.int8)
    upper, lower = int8_limits.max, int8_limits.min

    # NOTE(review): the operands are cast to float8_e4m3fn although the
    # values are clamped to the int8 range - confirm this is intended.
    lhs_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * upper
    lhs = lhs_fp32.clamp(min=lower, max=upper).to(torch.float8_e4m3fn)

    rhs_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * upper
    rhs = rhs_fp32.clamp(min=lower, max=upper).to(torch.float8_e4m3fn)

    # Ceil-divide N and K into tiles; one scale per tile.
    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k

    lhs_scale = torch.rand(M, k_tiles, dtype=torch.float32) * scale_factor
    rhs_scale = torch.rand(n_tiles, k_tiles,
                           dtype=torch.float32) * scale_factor

    expected = native_w8a8_block_matmul(lhs, rhs, lhs_scale, rhs_scale,
                                        block_size, out_dtype)
    actual = w8a8_block_int8_matmul(lhs, rhs, lhs_scale, rhs_scale,
                                    block_size, out_dtype)

    mean_abs_err = torch.mean(
        torch.abs(actual.to(torch.float32) - expected.to(torch.float32)))
    mean_abs_ref = torch.mean(torch.abs(expected.to(torch.float32)))
    rel_diff = mean_abs_err / mean_abs_ref
    assert rel_diff < 0.001
@pytest.mark.parametrize(
    "M, N, K, E, topk, block_size, dtype, seed",
    itertools.product(M, N, K, E, TOP_KS, BLOCK_SIZE, DTYPES, SEEDS))
@torch.inference_mode()
def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
    """Check `fused_moe` with W8A8 int8 block quantization against the
    native torch reference; the mean relative difference must be < 0.06."""
    torch.manual_seed(seed)
    # Small scales keep intermediate values from overflowing, especially
    # when the output dtype is float16.
    scale_factor = 1e-2
    int8_limits = torch.iinfo(torch.int8)
    upper, lower = int8_limits.max, int8_limits.min

    def ceil_div(value, block):
        return (value + block - 1) // block

    a = torch.randn((M, K), dtype=dtype) / 10

    w1_fp32 = (torch.rand(
        (E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * upper
    w1 = w1_fp32.clamp(min=lower, max=upper).to(torch.int8)

    w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * upper
    w2 = w2_fp32.clamp(min=lower, max=upper).to(torch.int8)

    block_n, block_k = block_size[0], block_size[1]
    w1_scale_shape = (E, ceil_div(2 * N, block_n), ceil_div(K, block_k))
    w2_scale_shape = (E, ceil_div(K, block_n), ceil_div(N, block_k))

    w1_s = torch.rand(w1_scale_shape, dtype=torch.float32) * scale_factor
    w2_s = torch.rand(w2_scale_shape, dtype=torch.float32) * scale_factor

    score = torch.randn((M, E), dtype=dtype)

    # Set the context to avoid lots of warning spam.
    with set_current_vllm_config(VllmConfig()):
        out = fused_moe(
            a,
            w1,
            w2,
            score,
            topk,
            renormalize=False,
            use_int8_w8a8=True,
            w1_scale=w1_s,
            w2_scale=w2_s,
            block_shape=block_size,
        )
        ref_out = torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score,
                                            topk, block_size)

    # Check results
    ref_f32 = ref_out.to(torch.float32)
    mean_abs_err = torch.mean(torch.abs(out.to(torch.float32) - ref_f32))
    rel_diff = mean_abs_err / torch.mean(torch.abs(ref_f32))
    assert rel_diff < 0.06
tests/kernels/test_flashmla.py
View file @
9c4ecf15
...
@@ -124,7 +124,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
...
@@ -124,7 +124,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
cal_diff
(
out_flash
,
out_torch
,
"out"
)
cal_diff
(
out_flash
,
out_torch
,
"out"
)
cal_diff
(
lse_flash
,
lse_torch
,
"lse"
)
cal_diff
(
lse_flash
,
lse_torch
,
"lse"
)
t
=
triton
.
testing
.
do_bench
(
flash_mla
,
fast_flush
=
False
)
t
=
triton
.
testing
.
do_bench
(
flash_mla
)
FLOPS
=
s_q
*
total_seqlens
*
h_q
*
(
d
+
dv
)
*
2
FLOPS
=
s_q
*
total_seqlens
*
h_q
*
(
d
+
dv
)
*
2
bytes
=
(
total_seqlens
*
h_kv
*
d
+
b
*
s_q
*
h_q
*
d
+
bytes
=
(
total_seqlens
*
h_kv
*
d
+
b
*
s_q
*
h_q
*
d
+
b
*
s_q
*
h_q
*
dv
)
*
(
torch
.
finfo
(
dtype
).
bits
//
8
)
b
*
s_q
*
h_q
*
dv
)
*
(
torch
.
finfo
(
dtype
).
bits
//
8
)
...
...
tests/kernels/test_int8_kernel.py
0 → 100644
View file @
9c4ecf15
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py
import
itertools
import
pytest
import
torch
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.model_executor.layers.quantization.utils.int8_utils
import
(
per_token_quant_int8
)
from
vllm.platforms
import
current_platform
# Skip every test in this module when the reported device capability
# compares below (7, 0).
if current_platform.get_device_capability() < (7, 0):
    pytest.skip("INT8 Triton requires CUDA 7.0 or higher",
                allow_module_level=True)
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
    """Reference matmul with per-token input scales and per-column weight
    scales.

    Computes ``As * (A @ B.T) * Bs`` in float32 and casts the result to
    `output_dtype`.

    Args:
        A: Input whose last dimension matches the last dimension of `B`;
            leading dimensions are flattened into rows.
        B: 2-D contiguous weight, one row per output column.
        As: Scale multiplied against the flattened ``(rows, cols)`` product
            (the caller passes a per-token column).
        Bs: Per-output-column scale; viewed as ``(1, -1)`` for broadcasting.
        output_dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``A.shape[:-1] + (B.shape[0],)``.
    """
    lhs = A.to(torch.float32)
    weight = B.to(torch.float32)

    assert lhs.shape[-1] == weight.shape[-1], "Dimension mismatch"
    assert weight.ndim == 2 and weight.is_contiguous(
    ), "B must be a 2D contiguous tensor"

    out_features, in_features = weight.shape
    num_rows = lhs.numel() // lhs.shape[-1]
    result_shape = lhs.shape[:-1] + (out_features, )

    # [num_rows, in_features] @ [in_features, out_features]
    product = torch.matmul(lhs.reshape(num_rows, in_features), weight.t())
    # Apply the row scale first, then broadcast the per-column scale.
    scaled = As * product * Bs.view(1, -1)

    return scaled.reshape(result_shape).to(output_dtype)
def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
    """Reference fused MoE with per-token int8 activations and per-column
    weight scales, written in native torch.

    Tokens are quantized with `per_token_quant_int8`, replicated `topk`
    times and routed by ``topk(softmax(score))``. Each expert that received
    tokens applies ``matmul(w1) -> SiluAndMul -> re-quantize -> matmul(w2)``;
    the expert outputs are weighted by the routing probabilities and summed
    per token.
    """
    num_tokens, hidden = a.shape

    # Perform per-token quantization, then repeat tokens (and their scales)
    # so that every token appears once per selected expert.
    a_q, a_s = per_token_quant_int8(a)
    a_q = a_q.view(num_tokens, -1, hidden).repeat(1, topk,
                                                  1).reshape(-1, hidden)
    a_s = a_s.view(num_tokens, -1, 1).repeat(1, topk, 1).reshape(-1, 1)

    out = torch.zeros(num_tokens * topk,
                      w2.shape[1],
                      dtype=a.dtype,
                      device=a.device)

    # Calculate routing
    probs = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(probs, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)

    # Process each expert
    for expert in range(w1.shape[0]):
        routed = topk_ids == expert
        if not routed.sum():
            continue
        # First MLP layer: a_s holds one scale per (replicated) token
        hidden_out = native_w8a8_per_token_matmul(a_q[routed],
                                                  w1[expert],
                                                  a_s[routed],
                                                  w1_s[expert],
                                                  output_dtype=a.dtype)
        # Activation function
        activated = SiluAndMul().forward_native(hidden_out)
        # Quantize activation output per token
        activated_q, activated_s = per_token_quant_int8(activated)
        # Second MLP layer
        out[routed] = native_w8a8_per_token_matmul(activated_q,
                                                   w2[expert],
                                                   activated_s,
                                                   w2_s[expert],
                                                   output_dtype=a.dtype)

    # Apply routing weights and sum
    per_expert = out.view(num_tokens, -1, w2.shape[1])
    weights = topk_weight.view(num_tokens, -1, 1).to(out.dtype)
    return (per_expert * weights).sum(dim=1)
@pytest.fixture(autouse=True, scope="module")
def setup_cuda():
    """Use CUDA as the default device for all tests in this module.

    The previous default device is restored once the module's tests have
    finished, so the setting does not leak into test modules that run
    afterwards.
    """
    previous_device = torch.get_default_device()
    torch.set_default_device("cuda")
    yield
    torch.set_default_device(previous_device)
# Parameter grids combined with itertools.product by the test below.
DTYPES = [torch.half, torch.bfloat16]  # activation / output dtypes
M = [1, 33]
N = [128, 1024]
K = [256, 4096]
E = [8]
TOP_KS = [2, 6]
SEEDS = [0]
@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed",
                         itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS))
@torch.inference_mode()
def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
    """Check `fused_moe` with int8 W8A8 per-channel quantization against
    the native torch reference; mean relative difference must be < 0.05.

    NOTE(review): despite the "fp8" in its name, this test builds int8
    weights and passes ``use_int8_w8a8=True``.
    """
    torch.manual_seed(seed)
    # Initialize int8 quantization parameters
    scale_factor = 1e-2
    int8_max = 127
    int8_min = -128

    def to_int8_weight(unit_weight):
        # Scale a weight drawn from [-1, 1) into the int8 range.
        return (unit_weight * int8_max).clamp(min=int8_min,
                                              max=int8_max).to(torch.int8)

    # Input tensor
    # M * K
    a = torch.randn((M, K), dtype=dtype) / 10

    # Generate int8 weights
    w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2
    w1 = to_int8_weight(w1_fp32)

    w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2
    w2 = to_int8_weight(w2_fp32)

    # Generate scale for each column (per-column quantization)
    w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * scale_factor
    w2_s = torch.rand(E, K, device=w2_fp32.device) * scale_factor
    score = torch.randn((M, E), dtype=dtype)

    ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk)
    out = fused_moe(
        a,
        w1,
        w2,
        score,
        topk,
        renormalize=False,
        use_int8_w8a8=True,  # Using int8-w8a8
        per_channel_quant=True,
        w1_scale=w1_s,
        w2_scale=w2_s,
        block_shape=None,  # Not using block quantization
    )

    # Check results
    ref_f32 = ref_out.to(torch.float32)
    mean_abs_err = torch.mean(torch.abs(out.to(torch.float32) - ref_f32))
    rel_diff = mean_abs_err / torch.mean(torch.abs(ref_f32))
    assert rel_diff < 0.05
tests/kernels/test_merge_attn_states.py
0 → 100644
View file @
9c4ecf15
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
import
pytest
import
torch
from
vllm._custom_ops
import
merge_attn_states
as
merge_attn_states_cuda
from
vllm.attention.ops.triton_merge_attn_states
import
(
merge_attn_states
as
merge_attn_states_triton
)
from
vllm.platforms
import
current_platform
# Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
# can be used to combine partial attention results (in the split-KV case)
def merge_attn_states_torch(
        output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
        prefix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
        prefix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
        suffix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
        suffix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
        output_lse: Optional[torch.Tensor] = None,  # [NUM_HEADS, NUM_TOKENS]
):
    """Merge prefix and suffix attention results by their log-sum-exp.

    Each partial output is weighted by ``exp(lse - max_lse)`` normalised
    over both parts. A ``+inf`` LSE entry is treated as ``-inf``, so that
    part contributes nothing.

    The input tensors are not modified: the inf replacement is done on
    copies, and the result is returned rather than written into `output`.

    Args:
        output: Unused placeholder kept for signature parity with the
            kernel implementations.
        prefix_output: Partial attention output of the prefix.
        prefix_lse: Log-sum-exp of the prefix part.
        suffix_output: Partial attention output of the suffix.
        suffix_lse: Log-sum-exp of the suffix part.
        output_lse: When not None, the merged LSE is computed and returned;
            its contents are not read.

    Returns:
        ``(merged_output, merged_lse)``; `merged_lse` is None when
        `output_lse` is None.
    """
    # inf -> -inf (out of place, so the caller's tensors stay untouched)
    p_lse = prefix_lse.masked_fill(prefix_lse == torch.inf, -torch.inf)
    s_lse = suffix_lse.masked_fill(suffix_lse == torch.inf, -torch.inf)

    # max_lse [NUM_HEADS, NUM_TOKENS]
    max_lse = torch.maximum(p_lse, s_lse)
    # Subtract the maximum before exponentiating for numerical stability.
    p_lse_exp = torch.exp(p_lse - max_lse)
    s_lse_exp = torch.exp(s_lse - max_lse)
    out_se = (p_lse_exp + s_lse_exp)

    if output_lse is not None:
        output_lse = torch.log(out_se) + max_lse

    p_scale = p_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
    s_scale = s_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
    p_scale = torch.transpose(p_scale, 0,
                              1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
    s_scale = torch.transpose(s_scale, 0,
                              1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]

    output = prefix_output * p_scale + suffix_output * s_scale
    return output, output_lse
# Parameter grids for test_merge_attn_states (one parametrize per list).
NUM_BATCH_TOKENS = [256, 512, 613, 1024, 1536, 4096]
NUM_QUERY_HEADS = [4, 8, 16, 32, 48, 64]
HEAD_SIZES = [32, 48, 64, 96, 128, 256]
DTYPES = [torch.float32, torch.half, torch.bfloat16]

# One tuple of timing results per executed test case; printed as a markdown
# table once every parameter combination has been recorded.
all_case_info: list[tuple] = []
def generate_markdown_table():
    """Print every recorded benchmark case as a row of a markdown table.

    Rows come from the module-level `all_case_info` list. The dtype is
    printed without its ``torch.`` prefix and the device name without a
    leading ``NVIDIA``.
    """
    header_row = ("| tokens | heads | headsize | dtype "
                  "| device | torch | triton | cuda | speedup |")
    divider_row = "| --- | --- | --- | --- | --- | --- | --- | --- | --- |"

    print(header_row)
    print(divider_row)
    for case in all_case_info:
        (num_tokens, num_heads, head_size, case_dtype, case_device,
         torch_ms, triton_ms, cuda_ms, speedup) = case
        dtype_label = str(case_dtype).removeprefix("torch.")
        device_label = case_device.removeprefix("NVIDIA").strip()
        print(f"| {num_tokens} | {num_heads} | {head_size} "
              f"| {dtype_label} | {device_label} | {torch_ms:.5f}ms "
              f"| {triton_ms:.5f}ms "
              f"| {cuda_ms:.5f}ms "
              f"| {speedup:.4f}x |")
@pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS)
@pytest.mark.parametrize("num_query_heads", NUM_QUERY_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("output_dtype", DTYPES)
@torch.inference_mode()
def test_merge_attn_states(num_tokens: int, num_query_heads: int,
                           head_size: int, output_dtype: torch.dtype):
    """Benchmark and compare the torch, Triton and CUDA merge kernels.

    The CUDA kernel's output and LSE must match the Triton kernel's within
    tolerance; the torch implementation is only timed and reported. Timings
    of every case are collected and printed as a markdown table after the
    last parameter combination.
    """
    if not current_platform.is_cuda():
        pytest.skip('Currently only support compare triton merge_attn_states '
                    'with custom cuda merge_attn_states kernel')

    NUM_TOKENS = num_tokens
    NUM_HEADS = num_query_heads
    HEAD_SIZE = head_size

    print(f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, "
          f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, "
          f"Device: {current_platform.get_device_name()}")

    lse_shape = (NUM_HEADS, NUM_TOKENS)
    out_shape = (NUM_TOKENS, NUM_HEADS, HEAD_SIZE)

    # prefix_lse and suffix_lse contain inf and normal values
    prefix_lse = torch.randn(NUM_HEADS,
                             NUM_TOKENS,
                             dtype=torch.float32,
                             device="cuda")
    suffix_lse = torch.randn(NUM_HEADS,
                             NUM_TOKENS,
                             dtype=torch.float32,
                             device="cuda")

    # Generate boolean masks
    mask_prefix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
    mask_suffix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
    # Ensure that the same position is not True at the same time
    combined_mask = torch.logical_and(mask_prefix, mask_suffix)
    mask_prefix = torch.logical_and(mask_prefix, ~combined_mask)
    mask_suffix = torch.logical_and(mask_suffix, ~combined_mask)

    prefix_lse[mask_prefix] = float('inf')
    suffix_lse[mask_suffix] = float('inf')

    # Other input tensors (need to be initialized but
    # no actual calculation needed)
    output = torch.zeros(out_shape, dtype=output_dtype, device="cuda")
    output_lse = torch.zeros(lse_shape, dtype=torch.float32, device="cuda")
    prefix_output = torch.randn(out_shape, dtype=output_dtype, device="cuda")
    suffix_output = torch.randn(out_shape, dtype=output_dtype, device="cuda")

    warmup_times = 2
    repeat_times = 20

    def average_ms(run_once, start_evt, end_evt):
        # Warm up, then average the event-measured time of repeat_times runs.
        for _ in range(warmup_times):
            run_once()
        torch.cuda.synchronize()

        total_ms = 0
        for _ in range(repeat_times):
            start_evt.record()
            run_once()
            end_evt.record()
            torch.cuda.synchronize()
            total_ms += start_evt.elapsed_time(end_evt)
        return total_ms / repeat_times

    # 0. Run the Torch kernel
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    prefix_lse_torch = prefix_lse.clone()
    suffix_lse_torch = suffix_lse.clone()
    # The torch implementation returns its results, so each run feeds the
    # previous results back in, as the kernels do with their buffers.
    torch_results = [output.clone(), output_lse.clone()]

    def run_torch():
        torch_results[:] = merge_attn_states_torch(
            torch_results[0], prefix_output, prefix_lse_torch, suffix_output,
            suffix_lse_torch, torch_results[1])

    avg_time_torch_kernel = average_ms(run_torch, start, end)
    output_torch, output_lse_torch = torch_results

    # 1. Run the Triton kernel
    output_ref_triton = output.clone()
    output_lse_ref_triton = output_lse.clone()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    def run_triton():
        merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse,
                                 suffix_output, suffix_lse,
                                 output_lse_ref_triton)

    avg_time_triton_kernel = average_ms(run_triton, start, end)

    # 2. Run the CUDA kernel (reuses the events created for Triton)
    output_cuda = output.clone()
    output_lse_cuda = output_lse.clone()

    def run_cuda():
        merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse,
                               suffix_output, suffix_lse, output_lse_cuda)

    avg_time_cuda_kernel = average_ms(run_cuda, start, end)

    # 3. Performance compare
    performance_improved = avg_time_triton_kernel / avg_time_cuda_kernel
    print(f" Torch time: {avg_time_torch_kernel:.6f}ms")
    print(f"Triton time: {avg_time_triton_kernel:.6f}ms")
    print(f" CUDA time: {avg_time_cuda_kernel:.6f}ms, "
          f"Performance: {performance_improved:.5f}x")
    print("-" * 100)

    # 4. Correctness compare
    # Liger Kernel: Efficient Triton Kernels for LLM Training
    # https://arxiv.org/pdf/2410.10989, 3.3 Correctness
    # use rtol = 1e-2 for bfloat16.
    rtol = 1e-2 if output_dtype == torch.bfloat16 else 1e-3

    def diff(a: torch.Tensor, b: torch.Tensor):
        return torch.max(torch.abs(a.float() - b.float()))

    # Use Triton output as reference because we want to replace
    # the Triton kernel with custom CUDA kernel for merge attn
    # states operation.
    output_ref = output_ref_triton
    output_lse_ref = output_lse_ref_triton

    torch.testing.assert_close(output_cuda.float(),
                               output_ref.float(),
                               atol=1e-3,
                               rtol=rtol)
    print("Output all match, max abs diff:")
    print(f"(Triton vs Torch) : {diff(output_torch, output_ref)}")
    print(f" (CUDA vs Torch) : {diff(output_torch, output_cuda)}")
    print(f" (CUDA vs Triton): {diff(output_ref, output_cuda)}")
    print("-" * 100)

    torch.testing.assert_close(output_lse_cuda.float(),
                               output_lse_ref.float(),
                               atol=1e-3,
                               rtol=rtol)
    print("Output LSE all match, max abs diff:")
    print(f"(Triton vs Torch) : {diff(output_lse_torch, output_lse_ref)}")
    print(f" (CUDA vs Torch) : {diff(output_lse_torch, output_lse_cuda)}")
    print(f" (CUDA vs Triton): {diff(output_lse_ref, output_lse_cuda)}")
    print("-" * 100)

    print("All output values test passed! All inf values "
          "are correctly replaced with -inf.")
    print("-" * 100)

    device = current_platform.get_device_name()
    all_case_info.append(
        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE, output_dtype, device,
         avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel,
         performance_improved))

    # Emit the summary table after the last parameter combination.
    total_cases = (len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) *
                   len(NUM_QUERY_HEADS) * len(DTYPES))
    if len(all_case_info) == total_cases:
        generate_markdown_table()
tests/kernels/test_triton_moe_ptpc_fp8.py
0 → 100644
View file @
9c4ecf15
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_triton_moe_channel_fp8_kernel.py
import
itertools
import
pytest
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.platforms
import
current_platform
# Skip every test in this module when the reported device capability
# compares below (9, 0).
if current_platform.get_device_capability() < (9, 0):
    pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
                allow_module_level=True)
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
    """Reference matmul with per-token input scales and per-column weight
    scales.

    Computes ``As * (A @ B.T) * Bs`` in float32 and casts the result to
    `output_dtype`.

    Args:
        A: Input whose last dimension matches the last dimension of `B`;
            leading dimensions are flattened into rows.
        B: 2-D contiguous weight, one row per output column.
        As: Scale multiplied against the flattened ``(rows, cols)`` product
            (the caller passes a per-token column).
        Bs: Per-output-column scale; viewed as ``(1, -1)`` for broadcasting.
        output_dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``A.shape[:-1] + (B.shape[0],)``.
    """
    lhs = A.to(torch.float32)
    weight = B.to(torch.float32)

    assert lhs.shape[-1] == weight.shape[-1], "Dimension mismatch"
    assert weight.ndim == 2 and weight.is_contiguous(
    ), "B must be a 2D contiguous tensor"

    out_features, in_features = weight.shape
    num_rows = lhs.numel() // lhs.shape[-1]
    result_shape = lhs.shape[:-1] + (out_features, )

    # [num_rows, in_features] @ [in_features, out_features]
    product = torch.matmul(lhs.reshape(num_rows, in_features), weight.t())
    # Apply the row scale first, then broadcast the per-column scale.
    scaled = As * product * Bs.view(1, -1)

    return scaled.reshape(result_shape).to(output_dtype)
def fp8_mask(a, mask):
    """Select the entries of `a` picked by `mask`, indexing through an int8
    view.

    `a` is reinterpreted as int8, indexed with `mask`, and the selection is
    reinterpreted back to the original dtype of `a`.
    """
    original_dtype = a.dtype
    as_int8 = a.view(torch.int8)
    selected = as_int8[mask]
    return selected.view(original_dtype)
def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
    """Reference fused MoE with per-token fp8 activations and per-column
    weight scales, written in native torch.

    Tokens are quantized with ``ops.scaled_fp8_quant`` (per token),
    replicated `topk` times and routed by ``topk(softmax(score))``. Each
    expert that received tokens applies
    ``matmul(w1) -> SiluAndMul -> re-quantize -> matmul(w2)``; the expert
    outputs are weighted by the routing probabilities and summed per token.
    """
    num_tokens, hidden = a.shape

    # Perform per-token quantization, then repeat tokens (and their scales)
    # so that every token appears once per selected expert.
    a_q, a_s = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=True)
    a_q = a_q.view(num_tokens, -1, hidden).repeat(1, topk,
                                                  1).reshape(-1, hidden)
    a_s = a_s.view(num_tokens, -1, 1).repeat(1, topk, 1).reshape(-1, 1)

    out = torch.zeros(num_tokens * topk,
                      w2.shape[1],
                      dtype=a.dtype,
                      device=a.device)

    # Calculate routing
    probs = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(probs, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)

    # Process each expert
    for expert in range(w1.shape[0]):
        routed = topk_ids == expert
        if not routed.sum():
            continue
        # First MLP layer: a_s holds one scale per (replicated) token.
        # The routed rows are selected through fp8_mask's int8 view.
        hidden_out = native_w8a8_per_token_matmul(
            fp8_mask(a_q, routed),
            w1[expert],
            fp8_mask(a_s, routed),
            w1_s[expert],
            output_dtype=a.dtype,
        )
        # Activation function
        activated = SiluAndMul().forward_native(hidden_out)
        # Quantize activation output per token
        activated_q, activated_s = ops.scaled_fp8_quant(
            activated, use_per_token_if_dynamic=True)
        # Second MLP layer
        out[routed] = native_w8a8_per_token_matmul(activated_q,
                                                   w2[expert],
                                                   activated_s,
                                                   w2_s[expert],
                                                   output_dtype=a.dtype)

    # Apply routing weights and sum
    per_expert = out.view(num_tokens, -1, w2.shape[1])
    weights = topk_weight.view(num_tokens, -1, 1).to(out.dtype)
    return (per_expert * weights).sum(dim=1)
@pytest.fixture(autouse=True, scope="module")
def setup_cuda():
    """Use CUDA as the default device for all tests in this module.

    The previous default device is restored once the module's tests have
    finished, so the setting does not leak into test modules that run
    afterwards.
    """
    previous_device = torch.get_default_device()
    torch.set_default_device("cuda")
    yield
    torch.set_default_device(previous_device)
# Parameter grids combined with itertools.product by the test below.
DTYPES = [torch.half, torch.bfloat16]  # activation / output dtypes
M = [1, 33]
N = [128, 1024]
K = [256, 4096]
E = [8]
TOP_KS = [2, 6]
SEEDS = [0]
@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed",
                         itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS))
@torch.inference_mode()
def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
    """Check `fused_moe` with fp8 W8A8 per-channel quantization against the
    native torch reference; mean relative difference must be < 0.05."""
    torch.manual_seed(seed)
    # Initialize fp8 quantization parameters
    scale_factor = 1e-2
    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_max = fp8_info.max
    fp8_min = fp8_info.min

    def to_fp8_weight(unit_weight):
        # Scale a weight drawn from [-1, 1) into the fp8 e4m3 range.
        return (unit_weight * fp8_max).clamp(
            min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

    # Input tensor
    # M * K
    a = torch.randn((M, K), dtype=dtype) / 10

    # Generate fp8 weights
    w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2
    w1 = to_fp8_weight(w1_fp32)

    w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2
    w2 = to_fp8_weight(w2_fp32)

    # Generate scale for each column (per-column quantization)
    w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * scale_factor
    w2_s = torch.rand(E, K, device=w2_fp32.device) * scale_factor
    score = torch.randn((M, E), dtype=dtype)

    ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk)
    out = fused_moe(
        a,
        w1,
        w2,
        score,
        topk,
        renormalize=False,
        use_fp8_w8a8=True,  # using fp8
        per_channel_quant=True,
        w1_scale=w1_s,
        w2_scale=w2_s,
        block_shape=None,  # Not using block quantization
    )

    # Check results
    ref_f32 = ref_out.to(torch.float32)
    mean_abs_err = torch.mean(torch.abs(out.to(torch.float32) - ref_f32))
    rel_diff = mean_abs_err / torch.mean(torch.abs(ref_f32))
    assert rel_diff < 0.05
tests/kernels/utils_block.py
0 → 100644
View file @
9c4ecf15
# SPDX-License-Identifier: Apache-2.0
import
torch
def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor,
                             As: torch.Tensor, Bs: torch.Tensor, block_size,
                             output_dtype):
    """Block-wise quantized matmul ``A @ B.T`` in native torch.

    Both operands are converted to float32, so the function works for any
    input dtype that supports that conversion. Every
    ``(block_n, block_k)`` tile of `B` has one scale in `Bs`, and every
    `block_k`-wide slice of each row of `A` has one scale in `As`; each
    partial product is multiplied by both scales before being accumulated.

    Args:
        A: Left operand; leading dimensions are flattened into rows.
        B: 2-D contiguous right operand of shape ``(N, K)``.
        As: Scales for `A`, shape ``A.shape[:-1] + (ceil(K / block_k),)``.
        Bs: Scales for `B`, shape ``(ceil(N / block_n), ceil(K / block_k))``.
        block_size: Two-element sequence ``(block_n, block_k)``.
        output_dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``A.shape[:-1] + (N,)`` in `output_dtype`.
    """
    lhs = A.to(torch.float32)
    rhs = B.to(torch.float32)

    assert lhs.shape[-1] == rhs.shape[-1]
    assert rhs.ndim == 2 and rhs.is_contiguous() and Bs.ndim == 2
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]
    assert (lhs.shape[-1] + block_k - 1) // block_k == As.shape[-1]
    assert lhs.shape[:-1] == As.shape[:-1]

    num_rows = lhs.numel() // lhs.shape[-1]
    N, K = rhs.shape
    result_shape = lhs.shape[:-1] + (N, )
    lhs = lhs.reshape(num_rows, lhs.shape[-1])
    lhs_scales = As.reshape(num_rows, As.shape[-1])

    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
    assert n_tiles == Bs.shape[0]
    assert k_tiles == Bs.shape[1]

    acc = torch.zeros((num_rows, N), dtype=torch.float32, device=lhs.device)

    for k_idx in range(k_tiles):
        k_lo = k_idx * block_k
        k_hi = min(k_lo + block_k, K)
        lhs_tile = lhs[:, k_lo:k_hi]
        # Column of per-row scales for this K tile, kept 2-D to broadcast.
        lhs_tile_scale = lhs_scales[:, k_idx:k_idx + 1]
        for n_idx in range(n_tiles):
            n_lo = n_idx * block_n
            n_hi = min(n_lo + block_n, N)
            rhs_tile = rhs[n_lo:n_hi, k_lo:k_hi]
            tile_scale = lhs_tile_scale * Bs[n_idx][k_idx]
            acc[:, n_lo:n_hi] += torch.matmul(lhs_tile,
                                              rhs_tile.t()) * tile_scale

    return acc.reshape(result_shape).to(output_dtype)
tests/lora/conftest.py
View file @
9c4ecf15
...
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
...
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
yield
yield
@pytest.fixture
def reset_default_device():
    """
    Put torch's default device back once the requesting test is done.

    Some tests, such as `test_punica_ops.py`, explicitly set the default
    device, which can affect subsequent tests. The device that is active
    before the test body runs is remembered here and reinstated after the
    `yield`, i.e. during fixture teardown.
    """
    device_before_test = torch.get_default_device()
    yield
    torch.set_default_device(device_before_test)
tests/lora/test_baichuan.py
View file @
9c4ecf15
...
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
...
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_loras
=
4
,
max_lora_rank
=
64
,
max_lora_rank
=
64
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
fully_sharded_loras
=
fully_sharded
)
fully_sharded_loras
=
fully_sharded
)
output_tp1
=
do_sample
(
llm_tp1
,
baichuan_lora_files
,
lora_id
=
1
)
output_tp1
=
do_sample
(
llm_tp1
,
baichuan_lora_files
,
lora_id
=
1
)
...
...
tests/lora/test_chatglm3_tp.py
View file @
9c4ecf15
...
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
...
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
enable_lora
=
True
,
enable_lora
=
True
,
max_loras
=
4
,
max_loras
=
4
,
max_lora_rank
=
64
,
max_lora_rank
=
64
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
...
...
tests/lora/test_layers.py
View file @
9c4ecf15
...
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
...
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
def
clean_cache
(
):
def
clean_cache
_reset_device
(
reset_default_device
):
# Release any memory we might be holding on to. CI runs OOMs otherwise.
# Release any memory we might be holding on to. CI runs OOMs otherwise.
from
vllm.lora.ops.triton_ops.utils
import
(
_LORA_A_PTR_DICT
,
from
vllm.lora.ops.triton_ops.utils
import
(
_LORA_A_PTR_DICT
,
_LORA_B_PTR_DICT
)
_LORA_B_PTR_DICT
)
...
...
tests/lora/test_llama_tp.py
View file @
9c4ecf15
...
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
...
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
# also test odd max_num_seqs
# also test odd max_num_seqs
max_num_seqs
=
13
,
max_num_seqs
=
13
,
max_loras
=
4
,
max_loras
=
4
,
tensor_parallel_size
=
1
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
generate_and_test
(
llm
,
sql_lora_files
)
generate_and_test
(
llm
,
sql_lora_files
)
...
...
tests/lora/test_punica_ops.py
View file @
9c4ecf15
...
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
...
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
from
.utils
import
PunicaTensors
,
assert_close
,
generate_data_for_nslices
from
.utils
import
PunicaTensors
,
assert_close
,
generate_data_for_nslices
@pytest.fixture(autouse=True)
def reset_device(reset_default_device):
    """Pull in `reset_default_device` for every test in this module.

    The fixture has no logic of its own: being `autouse` and depending on
    `reset_default_device` is what makes that fixture run around each test.
    """
# Utility shrink and expand operations used as reference implementations.
# Utility shrink and expand operations used as reference implementations.
def
sgmv_shrink_for_nslices
(
def
sgmv_shrink_for_nslices
(
nslices
:
int
,
inputs_tensor
:
torch
.
Tensor
,
nslices
:
int
,
inputs_tensor
:
torch
.
Tensor
,
...
...
tests/lora/test_quant_model.py
View file @
9c4ecf15
...
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
...
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
def
test_quant_model_lora
(
tinyllama_lora_files
,
model
):
def
test_quant_model_lora
(
tinyllama_lora_files
,
num_gpus_available
,
model
,
tp_size
):
if
num_gpus_available
<
tp_size
and
\
tp_size
>
1
and
current_platform
.
is_cuda_alike
():
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
llm
=
vllm
.
LLM
(
llm
=
vllm
.
LLM
(
model
=
model
.
model_path
,
model
=
model
.
model_path
,
...
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
...
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_loras
=
4
,
max_model_len
=
400
,
max_model_len
=
400
,
tensor_parallel_size
=
tp_size
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
,
quantization
=
model
.
quantization
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
...
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
enable_lora
=
True
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_loras
=
4
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
,
quantization
=
model
.
quantization
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
...
tests/lora/test_transfomers_model.py
View file @
9c4ecf15
...
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
...
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
enable_lora
=
True
,
enable_lora
=
True
,
max_loras
=
4
,
max_loras
=
4
,
max_lora_rank
=
16
,
max_lora_rank
=
16
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
...
...
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
9c4ecf15
...
@@ -12,6 +12,7 @@ from vllm.sequence import SampleLogprobs
...
@@ -12,6 +12,7 @@ from vllm.sequence import SampleLogprobs
from
....conftest
import
HfRunner
,
VllmRunner
from
....conftest
import
HfRunner
,
VllmRunner
from
....utils
import
RemoteOpenAIServer
from
....utils
import
RemoteOpenAIServer
from
...registry
import
HF_EXAMPLE_MODELS
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
MODEL_NAME
=
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
MODEL_NAME
=
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
...
@@ -55,7 +56,10 @@ def server(request, audio_assets):
...
@@ -55,7 +56,10 @@ def server(request, audio_assets):
for
key
,
value
in
request
.
param
.
items
()
for
key
,
value
in
request
.
param
.
items
()
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
,
env_dict
=
{
"VLLM_AUDIO_FETCH_TIMEOUT"
:
"30"
})
as
remote_server
:
yield
remote_server
yield
remote_server
...
@@ -106,6 +110,10 @@ def run_test(
...
@@ -106,6 +110,10 @@ def run_test(
**
kwargs
,
**
kwargs
,
):
):
"""Inference result should be the same between hf and vllm."""
"""Inference result should be the same between hf and vllm."""
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
# NOTE: take care of the order. run vLLM first, and then run HF.
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# if we run HF first, the cuda initialization will be done and it
...
@@ -156,6 +164,10 @@ def run_multi_audio_test(
...
@@ -156,6 +164,10 @@ def run_multi_audio_test(
num_logprobs
:
int
,
num_logprobs
:
int
,
**
kwargs
,
**
kwargs
,
):
):
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
enforce_eager
=
True
,
enforce_eager
=
True
,
...
...
tests/models/decoder_only/language/test_gguf.py
View file @
9c4ecf15
...
@@ -9,11 +9,13 @@ from typing import NamedTuple
...
@@ -9,11 +9,13 @@ from typing import NamedTuple
import
pytest
import
pytest
from
huggingface_hub
import
hf_hub_download
from
huggingface_hub
import
hf_hub_download
from
pytest
import
MarkDecorator
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
tests.quantization.utils
import
is_quant_method_supported
from
tests.quantization.utils
import
is_quant_method_supported
from
....conftest
import
VllmRunner
from
....conftest
import
VllmRunner
from
....utils
import
multi_gpu_test
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
...
@@ -25,6 +27,7 @@ class GGUFTestConfig(NamedTuple):
...
@@ -25,6 +27,7 @@ class GGUFTestConfig(NamedTuple):
original_model
:
str
original_model
:
str
gguf_repo
:
str
gguf_repo
:
str
gguf_filename
:
str
gguf_filename
:
str
marks
:
list
[
MarkDecorator
]
=
[]
@
property
@
property
def
gguf_model
(
self
):
def
gguf_model
(
self
):
...
@@ -35,6 +38,7 @@ LLAMA_CONFIG = GGUFTestConfig(
...
@@ -35,6 +38,7 @@ LLAMA_CONFIG = GGUFTestConfig(
original_model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
original_model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
gguf_repo
=
"bartowski/Llama-3.2-1B-Instruct-GGUF"
,
gguf_repo
=
"bartowski/Llama-3.2-1B-Instruct-GGUF"
,
gguf_filename
=
"Llama-3.2-1B-Instruct-IQ4_XS.gguf"
,
gguf_filename
=
"Llama-3.2-1B-Instruct-IQ4_XS.gguf"
,
marks
=
[
pytest
.
mark
.
quant_model
],
)
)
QWEN2_CONFIG
=
GGUFTestConfig
(
QWEN2_CONFIG
=
GGUFTestConfig
(
...
@@ -81,34 +85,24 @@ MODELS = [
...
@@ -81,34 +85,24 @@ MODELS = [
]
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gguf"
),
def
check_model_outputs
(
reason
=
"gguf is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
def
test_models
(
num_gpus_available
:
int
,
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
example_
prompts
:
list
[
str
],
prompts
:
list
[
str
],
model
:
GGUFTestConfig
,
model
:
GGUFTestConfig
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
tp_size
:
int
,
tp_size
:
int
,
)
->
None
:
):
if
num_gpus_available
<
tp_size
:
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
.
original_model
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
.
original_model
)
if
tokenizer
.
chat_template
is
not
None
:
if
tokenizer
.
chat_template
is
not
None
:
messages
=
[[{
messages
=
[[{
'role'
:
'user'
,
'role'
:
'user'
,
'content'
:
prompt
'content'
:
prompt
}]
for
prompt
in
example_prompts
]
}]
for
prompt
in
prompts
]
example_prompts
=
tokenizer
.
apply_chat_template
(
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
tokenize
=
False
,
add_generation_prompt
=
True
)
# Run gguf model.
# Run gguf model.
with
vllm_runner
(
model_name
=
model
.
gguf_model
,
with
vllm_runner
(
model_name
=
model
.
gguf_model
,
...
@@ -118,17 +112,19 @@ def test_models(
...
@@ -118,17 +112,19 @@ def test_models(
max_model_len
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tp_size
)
as
gguf_model
:
tensor_parallel_size
=
tp_size
)
as
gguf_model
:
gguf_outputs
=
gguf_model
.
generate_greedy_logprobs
(
gguf_outputs
=
gguf_model
.
generate_greedy_logprobs
(
example_
prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
# Run unquantized model.
# Run unquantized model.
# Should run with tp=1, otherwise the test will get stuck at
# nccl initialization.
with
vllm_runner
(
with
vllm_runner
(
model_name
=
model
.
original_model
,
model_name
=
model
.
original_model
,
enforce_eager
=
True
,
# faster tests
enforce_eager
=
True
,
# faster tests
dtype
=
dtype
,
dtype
=
dtype
,
max_model_len
=
MAX_MODEL_LEN
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tp_size
)
as
original_model
:
tensor_parallel_size
=
1
)
as
original_model
:
original_outputs
=
original_model
.
generate_greedy_logprobs
(
original_outputs
=
original_model
.
generate_greedy_logprobs
(
example_
prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
original_outputs
,
outputs_0_lst
=
original_outputs
,
...
@@ -136,3 +132,47 @@ def test_models(
...
@@ -136,3 +132,47 @@ def test_models(
name_0
=
"original"
,
name_0
=
"original"
,
name_1
=
"gguf"
,
name_1
=
"gguf"
,
)
)
@pytest.mark.skipif(
    not is_quant_method_supported("gguf"),
    reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
    "model",
    [pytest.param(cfg, marks=cfg.marks) for cfg in MODELS],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1])
def test_models(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Run `check_model_outputs` for every config in `MODELS` with tp_size=1.

    Each config is wrapped in `pytest.param` so that the marks stored on the
    config itself are attached to its parametrized test case.
    """
    check_model_outputs(
        vllm_runner,
        example_prompts,
        model,
        dtype,
        max_tokens,
        num_logprobs,
        tp_size,
    )
@pytest.mark.skipif(
    not is_quant_method_supported("gguf"),
    reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [2])
@multi_gpu_test(num_gpus=2)
def test_distributed(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Run `check_model_outputs` for `LLAMA_CONFIG` with tp_size=2.

    The test is decorated with `multi_gpu_test(num_gpus=2)` and generates
    only 8 tokens per prompt.
    """
    check_model_outputs(
        vllm_runner,
        example_prompts,
        model,
        dtype,
        max_tokens,
        num_logprobs,
        tp_size,
    )
tests/models/decoder_only/vision_language/test_models.py
View file @
9c4ecf15
...
@@ -160,17 +160,32 @@ VLM_TEST_SETTINGS = {
...
@@ -160,17 +160,32 @@ VLM_TEST_SETTINGS = {
),
),
"aya_vision"
:
VLMTestInfo
(
"aya_vision"
:
VLMTestInfo
(
models
=
[
"CohereForAI/aya-vision-8b"
],
models
=
[
"CohereForAI/aya-vision-8b"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{
img_prompt
}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
,
# noqa: E501
prompt_formatter
=
lambda
img_prompt
:
f
"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{
img_prompt
}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>What's the content in the center of the image?"
,
# noqa: E501
"stop_sign"
:
"<image>What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<image>What is the season?"
,
# noqa: E501
"cherry_blossom"
:
"<image>What is the season?"
,
# noqa: E501
}),
}),
multi_image_prompt
=
"<image><image>Describe the two images in detail."
,
# noqa: E501
multi_image_prompt
=
"<image><image>Describe the two images in detail."
,
# noqa: E501
max_model_len
=
8192
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"crop_to_patches"
:
True
}},
),
"aya_vision-multi_image"
:
VLMTestInfo
(
models
=
[
"CohereForAI/aya-vision-8b"
],
test_type
=
(
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{
img_prompt
}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<image>What is the season?"
,
# noqa: E501
}),
multi_image_prompt
=
"<image><image>Describe the two images in detail."
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
auto_cls
=
AutoModelForImageTextToText
,
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"crop_to_patches"
:
True
}}
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"crop_to_patches"
:
True
}},
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
),
"blip2"
:
VLMTestInfo
(
"blip2"
:
VLMTestInfo
(
# TODO: Change back to 2.7b once head_dim = 80 is supported
# TODO: Change back to 2.7b once head_dim = 80 is supported
...
@@ -303,6 +318,21 @@ VLM_TEST_SETTINGS = {
...
@@ -303,6 +318,21 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos
=
True
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
),
),
"llama4"
:
VLMTestInfo
(
models
=
[
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
],
prompt_formatter
=
lambda
img_prompt
:
f
"<|begin_of_text|><|header_start|>user<|header_end|>
\n\n
{
img_prompt
}
<|eot|><|header_start|>assistant<|header_end|>
\n\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
_
:
"<|image|>"
,
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
distributed_executor_backend
=
"mp"
,
image_size_factors
=
[(.
25
,
0.5
,
1.0
)],
hf_model_kwargs
=
{
"device_map"
:
"auto"
},
max_model_len
=
8192
,
max_num_seqs
=
4
,
dtype
=
"bfloat16"
,
auto_cls
=
AutoModelForImageTextToText
,
tensor_parallel_size
=
4
,
marks
=
multi_gpu_marks
(
num_gpus
=
4
),
),
"llava_next"
:
VLMTestInfo
(
"llava_next"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
...
@@ -395,23 +425,20 @@ VLM_TEST_SETTINGS = {
...
@@ -395,23 +425,20 @@ VLM_TEST_SETTINGS = {
max_num_seqs
=
2
,
max_num_seqs
=
2
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
),
),
# Tests for phi3v currently live in another file because of a bug in
"phi3v"
:
VLMTestInfo
(
# transformers. Once this issue is fixed, we can enable them here instead.
models
=
[
"microsoft/Phi-3.5-vision-instruct"
],
# https://github.com/huggingface/transformers/issues/34307
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
# "phi3v": VLMTestInfo(
prompt_formatter
=
lambda
img_prompt
:
f
"<|user|>
\n
{
img_prompt
}
<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
# models=["microsoft/Phi-3.5-vision-instruct"],
img_idx_to_prompt
=
lambda
idx
:
f
"<|image_
{
idx
}
|>
\n
"
,
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
max_model_len
=
4096
,
# prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
max_num_seqs
=
2
,
# img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
task
=
"generate"
,
# max_model_len=4096,
# use sdpa mode for hf runner since phi3v didn't work with flash_attn
# max_num_seqs=2,
hf_model_kwargs
=
{
"_attn_implementation"
:
"sdpa"
},
# task="generate",
use_tokenizer_eos
=
True
,
# # use eager mode for hf runner since phi3v didn't work with flash_attn
vllm_output_post_proc
=
model_utils
.
phi3v_vllm_to_hf_output
,
# hf_model_kwargs={"_attn_implementation": "eager"},
num_logprobs
=
10
,
# use_tokenizer_eos=True,
),
# vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
# num_logprobs=10,
# ),
"pixtral_hf"
:
VLMTestInfo
(
"pixtral_hf"
:
VLMTestInfo
(
models
=
[
"nm-testing/pixtral-12b-FP8-dynamic"
],
models
=
[
"nm-testing/pixtral-12b-FP8-dynamic"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
...
@@ -463,6 +490,16 @@ VLM_TEST_SETTINGS = {
...
@@ -463,6 +490,16 @@ VLM_TEST_SETTINGS = {
patch_hf_runner
=
model_utils
.
skyworkr1v_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
skyworkr1v_patch_hf_runner
,
marks
=
[
large_gpu_mark
(
min_gb
=
80
)],
marks
=
[
large_gpu_mark
(
min_gb
=
80
)],
),
),
"smolvlm"
:
VLMTestInfo
(
models
=
[
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User:
{
img_prompt
}
<end_of_utterance>
\n
Assistant:"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<image>"
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
hf_output_post_proc
=
model_utils
.
smolvlm_trunc_hf_output
,
),
### Tensor parallel / multi-gpu broadcast tests
### Tensor parallel / multi-gpu broadcast tests
"chameleon-broadcast"
:
VLMTestInfo
(
"chameleon-broadcast"
:
VLMTestInfo
(
models
=
[
"facebook/chameleon-7b"
],
models
=
[
"facebook/chameleon-7b"
],
...
...
tests/models/decoder_only/vision_language/test_phi3v.py
deleted
100644 → 0
View file @
bfc2d6f7
# SPDX-License-Identifier: Apache-2.0
import
os
import
re
from
typing
import
Optional
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
...utils
import
check_logprobs_close
# Single-image prompts, one per image asset name, already wrapped in the
# <|user|> / <|assistant|> chat markers used by the tests below.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign": ("<|user|>\n<|image_1|>\n"
                  "What's the content of the image?<|end|>\n"
                  "<|assistant|>\n"),
    "cherry_blossom": ("<|user|>\n<|image_1|>\n"
                       "What is the season?<|end|>\n"
                       "<|assistant|>\n"),
})

# Prompt that references two image placeholders at once.
HF_MULTIIMAGE_IMAGE_PROMPT = ("<|user|>\n<|image_1|>\n<|image_2|>\n"
                              "Describe these images.<|end|>\n"
                              "<|assistant|>\n")

# Model identifiers the tests in this file are parametrized over.
models = ["microsoft/Phi-3.5-vision-instruct"]
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Rewrite one vLLM output so it can be compared with the HF output.

    Args:
        vllm_output: `(token_ids, text, logprobs)`; the token ids are
            ignored and recomputed from the cleaned text.
        model: Name or path handed to `AutoTokenizer.from_pretrained`.

    Returns:
        `(hf_output_ids, hf_output_str, out_logprobs)` where the ids are the
        re-encoded cleaned text without its first token, the string is the
        cleaned text followed by `<|end|><|endoftext|>`, and the logprobs
        are passed through unchanged.
    """
    _, generated_text, logprobs = vllm_output

    # Strip every run of `<|image_N|>` placeholders from the generated text.
    cleaned = re.sub(r"(<\|image_\d+\|>)+", "", generated_text)

    # The remaining text is expected to start with exactly one space, which
    # is dropped before re-tokenizing.
    assert cleaned[0] == " "
    cleaned = cleaned[1:]

    tokenizer = AutoTokenizer.from_pretrained(model)
    token_ids = tokenizer.encode(cleaned)

    # The encoder prepends token id 1 (presumably BOS - confirm against the
    # tokenizer); it is removed from the returned ids.
    assert token_ids[0] == 1

    return token_ids[1:], cleaned + "<|end|><|endoftext|>", logprobs
# dtype every test in this file is parametrized with.
target_dtype = "half"

# ROCm Triton FA can run into shared memory issues with these models,
# so it is switched off through the environment and other backends are
# used in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that vLLM and HF produce close greedy logprobs for `model`.

    Every `(prompts, images)` case in `inputs` is generated greedily with
    the vLLM runner and then with the HF runner. The vLLM outputs are
    passed through `vllm_to_hf_output` and each case is compared with
    `check_logprobs_close`.

    Args:
        hf_runner: Context-manager factory for the HF model.
        vllm_runner: Context-manager factory for the vLLM model.
        inputs: One `(prompts, images)` pair per test case.
        model: Model name or path given to both runners.
        dtype: dtype given to both runners.
        max_tokens: Generation length for both runners.
        num_logprobs: Number of logprobs requested from both runners.
        mm_limit: Value used for `limit_mm_per_prompt["image"]` in vLLM.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    # HACK - this is an attempted workaround for the following bug
    # https://github.com/huggingface/transformers/issues/34307
    from transformers import AutoImageProcessor  # noqa: F401
    from transformers import AutoProcessor  # noqa: F401

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
            model,
            task="generate",
            max_model_len=4096,
            max_num_seqs=2,
            dtype=dtype,
            limit_mm_per_prompt={"image": mm_limit},
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        vllm_results = []
        for case_prompts, case_images in inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                ))

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
    with hf_runner(
            model,
            dtype=dtype,
            model_kwargs={"_attn_implementation": "eager"},
    ) as hf_model:
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        hf_results = []
        for case_prompts, case_images in inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                    eos_token_id=eos_token_id,
                ))

    # Compare case by case, after converting vLLM outputs to the HF layout.
    for hf_case, vllm_case in zip(hf_results, vllm_results):
        converted = [vllm_to_hf_output(out, model) for out in vllm_case]
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=converted,
            name_0="hf",
            name_1="vllm",
        )
# Since we use _attn_implementation="eager" for hf_runner, there is more
# significant numerical difference. The basic `logprobs=5` fails to pass.
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Compare HF and vLLM on single-image prompts at several image scales.

    For each image asset paired with its prompt, one test case is built
    that repeats the prompt once per entry in `size_factors` and rescales
    the image by each factor. All cases go through `run_test` with one
    image allowed per prompt.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    cases = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        scaled = [rescale_image_size(pil_image, f) for f in size_factors]
        cases.append(([prompt] * len(size_factors), scaled))

    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
                         dtype) -> None:
    """Regression test for #7840.

    Builds one test case per image asset, each holding a single prompt and
    a single unscaled image, and runs them through `run_test`.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    cases = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        cases.append(([prompt], [pil_image]))

    run_test(
        hf_runner,
        vllm_runner,
        cases,
        model,
        dtype=dtype,
        max_tokens=128,
        num_logprobs=10,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_tokens: int,
                             num_logprobs: int) -> None:
    """Compare HF and vLLM when every prompt carries all image assets.

    A single test case is built: the multi-image prompt repeated once per
    entry in `size_factors`, each paired with the full list of images
    rescaled by that factor. `run_test` is called with two images allowed
    per prompt.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    image_groups = []
    for factor in size_factors:
        image_groups.append(
            [rescale_image_size(pil_image, factor)
             for pil_image in pil_images])

    run_test(
        hf_runner,
        vllm_runner,
        [(prompts, image_groups)],
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
tests/models/decoder_only/vision_language/test_phi4mm.py
View file @
9c4ecf15
...
@@ -2,18 +2,22 @@
...
@@ -2,18 +2,22 @@
import
os
import
os
import
re
import
re
from
collections.abc
import
Sequence
from
typing
import
Optional
from
typing
import
Optional
import
librosa
import
pytest
import
pytest
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
vllm.assets.image
import
ImageAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptAudioInput
,
PromptImageInput
,
VllmRunner
)
from
....utils
import
large_gpu_test
from
....utils
import
large_gpu_test
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
...
@@ -29,6 +33,8 @@ model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
...
@@ -29,6 +33,8 @@ model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
# we have to manually specify the path of the lora weights.
vision_lora_path
=
os
.
path
.
join
(
model_path
,
"vision-lora"
)
vision_lora_path
=
os
.
path
.
join
(
model_path
,
"vision-lora"
)
speech_question
=
os
.
path
.
join
(
model_path
,
"examples"
,
"what_is_shown_in_this_image.wav"
)
models
=
[
model_path
]
models
=
[
model_path
]
...
@@ -64,7 +70,8 @@ if current_platform.is_rocm():
...
@@ -64,7 +70,8 @@ if current_platform.is_rocm():
def
run_test
(
def
run_test
(
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
inputs
:
list
[
tuple
[
list
[
str
],
PromptImageInput
]],
inputs
:
Sequence
[
tuple
[
list
[
str
],
PromptImageInput
,
Optional
[
PromptAudioInput
]]],
model
:
str
,
model
:
str
,
*
,
*
,
max_model_len
:
int
,
max_model_len
:
int
,
...
@@ -104,28 +111,49 @@ def run_test(
...
@@ -104,28 +111,49 @@ def run_test(
enforce_eager
=
True
,
enforce_eager
=
True
,
)
as
vllm_model
:
)
as
vllm_model
:
lora_request
=
LoRARequest
(
"vision"
,
1
,
vision_lora_path
)
lora_request
=
LoRARequest
(
"vision"
,
1
,
vision_lora_path
)
vllm_model
.
model
.
llm_engine
.
add_lora
(
lora_request
=
lora_request
)
vllm_outputs_per_case
=
[
vllm_outputs_per_case
=
[
vllm_model
.
generate_greedy_logprobs
(
prompts
,
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
max_tokens
,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
images
=
images
)
images
=
images
,
for
prompts
,
images
in
inputs
audios
=
audios
,
lora_request
=
lora_request
)
for
prompts
,
images
,
audios
in
inputs
]
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs
=
{
"_attn_implementation"
:
"sdpa"
}
hf_model_kwargs
=
{
"_attn_implementation"
:
"eager"
}
with
hf_runner
(
model
,
dtype
=
dtype
,
with
hf_runner
(
model
,
dtype
=
dtype
,
model_kwargs
=
hf_model_kwargs
)
as
hf_model
:
model_kwargs
=
hf_model_kwargs
)
as
hf_model
:
eos_token_id
=
hf_model
.
processor
.
tokenizer
.
eos_token_id
hf_processor
=
hf_model
.
processor
eos_token_id
=
hf_processor
.
tokenizer
.
eos_token_id
def
patch_hf_processor
(
*
args
,
text
=
""
,
images
=
None
,
audio
=
None
,
sampling_rate
=
None
,
**
kwargs
):
audios
=
None
if
audio
is
not
None
and
sampling_rate
is
not
None
:
audios
=
[(
audio
,
sampling_rate
)]
return
hf_processor
(
*
args
,
text
=
text
,
images
=
images
,
audios
=
audios
,
**
kwargs
)
hf_model
.
processor
=
patch_hf_processor
hf_outputs_per_case
=
[
hf_outputs_per_case
=
[
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
max_tokens
,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
images
=
images
,
images
=
images
,
audios
=
audios
,
eos_token_id
=
eos_token_id
,
eos_token_id
=
eos_token_id
,
num_logits_to_keep
=
0
)
num_logits_to_keep
=
0
)
for
prompts
,
images
in
inputs
for
prompts
,
images
,
audios
in
inputs
]
]
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
...
@@ -138,8 +166,6 @@ def run_test(
...
@@ -138,8 +166,6 @@ def run_test(
)
)
# Since we use _attn_implementation="eager" for hf_runner, there is more
# significant numerical difference. The basic `logprobs=5` fails to pass.
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
"size_factors"
,
...
@@ -151,7 +177,7 @@ def run_test(
...
@@ -151,7 +177,7 @@ def run_test(
# Single-scale, batched
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
[
1.0
,
1.0
,
1.0
],
# Multi-scale
# Multi-scale
[
0.
7
,
0.
7
5
,
1.0
],
[
0.
25
,
0.5
,
1.0
],
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
...
@@ -166,6 +192,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
...
@@ -166,6 +192,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
inputs_per_image
=
[(
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
None
,
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
run_test
(
run_test
(
...
@@ -201,17 +228,18 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
...
@@ -201,17 +228,18 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
10000
])
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
10000
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
xfail
(
reason
=
"Phi-4-MM multi-image inference is divergent with hf model."
)
def
test_multi_images_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
def
test_multi_images_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
size_factors
,
dtype
:
str
,
max_model_len
:
int
,
size_factors
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
inputs_per_case
=
[
inputs_per_case
=
[
([
HF_MULTIIMAGE_IMAGE_PROMPT
for
_
in
size_factors
],
(
[[
rescale_image_size
(
image
,
factor
)
for
image
in
images
]
[
HF_MULTIIMAGE_IMAGE_PROMPT
for
_
in
size_factors
],
for
factor
in
size_factors
])
[[
rescale_image_size
(
image
,
factor
)
for
image
in
images
]
for
factor
in
size_factors
],
None
,
),
]
]
run_test
(
run_test
(
...
@@ -226,3 +254,38 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
...
@@ -226,3 +254,38 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
mm_limit
=
2
,
mm_limit
=
2
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
)
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_model_len"
,
[
10000
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_vision_speech_models
(
hf_runner
,
vllm_runner
,
model
,
dtype
:
str
,
max_model_len
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
# use the example speech question so that the model outputs are reasonable
audio
=
librosa
.
load
(
speech_question
,
sr
=
None
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
inputs_vision_speech
=
[
(
[
"<|user|><|image_1|><|audio_1|><|end|><|assistant|>"
],
[
image
],
[
audio
],
),
]
run_test
(
hf_runner
,
vllm_runner
,
inputs_vision_speech
,
model
,
dtype
=
dtype
,
max_model_len
=
max_model_len
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
mm_limit
=
1
,
tensor_parallel_size
=
1
,
)
tests/models/decoder_only/vision_language/test_pixtral.py
View file @
9c4ecf15
...
@@ -176,6 +176,8 @@ def test_chat(
...
@@ -176,6 +176,8 @@ def test_chat(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
tokenizer_mode
=
"mistral"
,
tokenizer_mode
=
"mistral"
,
load_format
=
"mistral"
,
config_format
=
"mistral"
,
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
)
as
vllm_model
:
...
@@ -198,22 +200,14 @@ def test_chat(
...
@@ -198,22 +200,14 @@ def test_chat(
@
large_gpu_test
(
min_gb
=
48
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"prompt,expected_ranges"
,
"prompt,expected_ranges"
,
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[(
_create_engine_inputs_hf
(
IMG_URLS
[:
1
]),
[{
[
PlaceholderRange
(
offset
=
11
,
length
=
494
)]),
"offset"
:
11
,
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[
"length"
:
494
PlaceholderRange
(
offset
=
11
,
length
=
266
),
}]),
PlaceholderRange
(
offset
=
277
,
length
=
1056
),
(
_create_engine_inputs_hf
(
IMG_URLS
[
1
:
4
]),
[{
PlaceholderRange
(
offset
=
1333
,
length
=
418
)
"offset"
:
11
,
])])
"length"
:
266
},
{
"offset"
:
277
,
"length"
:
1056
},
{
"offset"
:
1333
,
"length"
:
418
}])])
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
,
def
test_multi_modal_placeholders
(
vllm_runner
,
prompt
,
expected_ranges
:
list
[
PlaceholderRange
],
expected_ranges
:
list
[
PlaceholderRange
],
monkeypatch
)
->
None
:
monkeypatch
)
->
None
:
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment