Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
31330101
"tests/kernels/moe/untest_moe.py" did not exist on "015fab8c2fa4db8776f7e91abd50371911673d88"
Commit
31330101
authored
Apr 16, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.4' into v0.8.4-dev
parents
e8933c34
dc1b4a6f
Changes
346
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1099 additions
and
134 deletions
+1099
-134
tests/entrypoints/openai/test_prompt_validation.py
tests/entrypoints/openai/test_prompt_validation.py
+1
-1
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+94
-4
tests/kernels/test_block_fp8.py
tests/kernels/test_block_fp8.py
+19
-73
tests/kernels/test_block_int8.py
tests/kernels/test_block_int8.py
+199
-0
tests/kernels/test_flashmla.py
tests/kernels/test_flashmla.py
+1
-1
tests/kernels/test_int8_kernel.py
tests/kernels/test_int8_kernel.py
+149
-0
tests/kernels/test_merge_attn_states.py
tests/kernels/test_merge_attn_states.py
+265
-0
tests/kernels/test_triton_moe_ptpc_fp8.py
tests/kernels/test_triton_moe_ptpc_fp8.py
+159
-0
tests/kernels/utils_block.py
tests/kernels/utils_block.py
+63
-0
tests/lora/conftest.py
tests/lora/conftest.py
+12
-0
tests/lora/test_baichuan.py
tests/lora/test_baichuan.py
+0
-1
tests/lora/test_chatglm3_tp.py
tests/lora/test_chatglm3_tp.py
+0
-1
tests/lora/test_layers.py
tests/lora/test_layers.py
+1
-1
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+0
-1
tests/lora/test_punica_ops.py
tests/lora/test_punica_ops.py
+5
-0
tests/lora/test_quant_model.py
tests/lora/test_quant_model.py
+1
-8
tests/lora/test_transfomers_model.py
tests/lora/test_transfomers_model.py
+0
-1
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+13
-2
tests/models/decoder_only/language/test_gguf.py
tests/models/decoder_only/language/test_gguf.py
+60
-20
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+57
-20
No files found.
tests/entrypoints/openai/test_prompt_validation.py
View file @
31330101
...
...
@@ -18,7 +18,7 @@ async def test_empty_prompt():
client
=
remote_server
.
get_async_client
()
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
re
.
compile
(
'.+P
rompt cannot be empty
.+'
)
):
match
=
"decoder p
rompt cannot be empty
"
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
""
,
max_tokens
=
5
,
...
...
tests/entrypoints/test_chat_utils.py
View file @
31330101
...
...
@@ -25,15 +25,15 @@ from ..utils import VLLM_PATH
EXAMPLES_DIR
=
VLLM_PATH
/
"examples"
PHI3V_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)
ULTRAVOX_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
)
QWEN2AUDIO_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
)
QWEN2VL_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
QWEN25VL_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-VL-3B-Instruct"
)
MLLAMA_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
)
LLAMA_GUARD_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-Guard-3-1B"
)
HERMES_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"NousResearch/Hermes-3-Llama-3.1-8B"
)
MISTRAL_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
phi3v_model_config
():
...
...
@@ -83,6 +83,30 @@ def mllama_tokenizer():
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
mistral_model_config
():
return
ModelConfig
(
MISTRAL_MODEL_ID
,
task
=
"generate"
,
tokenizer
=
MISTRAL_MODEL_ID
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
"auto"
,
seed
=
0
,
limit_mm_per_prompt
=
{
"image"
:
2
,
})
@
pytest
.
fixture
(
scope
=
"module"
)
def
mistral_tokenizer
():
return
TokenizerGroup
(
tokenizer_id
=
MISTRAL_MODEL_ID
,
enable_lora
=
False
,
max_num_seqs
=
5
,
max_input_length
=
None
,
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
image_url
():
image
=
ImageAsset
(
'cherry_blossom'
)
...
...
@@ -134,6 +158,66 @@ def test_parse_chat_messages_single_image(
_assert_mm_data_is_image_input
(
mm_data
,
1
)
def
test_parse_chat_messages_empty_system
(
mistral_model_config
,
mistral_tokenizer
,
):
# Test string format
conversation
,
_
=
parse_chat_messages
(
[{
"role"
:
"system"
,
"content"
:
""
},
{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
"Who are you?"
}]
}],
mistral_model_config
,
mistral_tokenizer
,
content_format
=
"string"
,
)
assert
conversation
==
[{
"role"
:
"system"
,
"content"
:
""
},
{
"role"
:
"user"
,
"content"
:
"Who are you?"
}]
# Test openai format
conversation
,
_
=
parse_chat_messages
(
[{
"role"
:
"system"
,
"content"
:
""
},
{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
"Who are you?"
}]
}],
mistral_model_config
,
mistral_tokenizer
,
content_format
=
"openai"
,
)
assert
conversation
==
[{
"role"
:
"system"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
""
}]
},
{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
"Who are you?"
}]
}]
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_single_image_async
(
phi3v_model_config
,
...
...
@@ -674,7 +758,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
# Build a config for the model
model_config
=
ModelConfig
(
model
,
task
=
"generate"
,
tokenizer
=
MLLAMA_MODEL_ID
,
tokenizer
=
model
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
"auto"
,
...
...
@@ -685,7 +769,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
# Build the tokenizer group and grab the underlying tokenizer
tokenizer_group
=
TokenizerGroup
(
MLLAMA_MODEL_ID
,
model
,
enable_lora
=
False
,
max_num_seqs
=
5
,
max_input_length
=
None
,
...
...
@@ -759,6 +843,8 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
assert
isinstance
(
chat_template
,
str
)
# NOTE: Qwen2-Audio default chat template is specially defined inside
# processor class instead of using `tokenizer_config.json`
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"model"
,
"expected_format"
),
...
...
@@ -766,6 +852,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
(
QWEN2VL_MODEL_ID
,
"openai"
),
(
QWEN25VL_MODEL_ID
,
"openai"
),
(
ULTRAVOX_MODEL_ID
,
"string"
),
(
QWEN2AUDIO_MODEL_ID
,
"openai"
),
(
MLLAMA_MODEL_ID
,
"openai"
),
(
LLAMA_GUARD_MODEL_ID
,
"openai"
)],
)
...
...
@@ -818,10 +905,13 @@ def test_resolve_content_format_hf_defined(model, expected_format):
(
"template_chatglm2.jinja"
,
"string"
),
(
"template_chatml.jinja"
,
"string"
),
(
"template_deepseek_vl2.jinja"
,
"string"
),
(
"template_dse_qwen2_vl.jinja"
,
"openai"
),
(
"template_falcon_180b.jinja"
,
"string"
),
(
"template_falcon.jinja"
,
"string"
),
(
"template_florence2.jinja"
,
"string"
),
(
"template_inkbot.jinja"
,
"string"
),
(
"template_llava.jinja"
,
"string"
),
(
"template_teleflm.jinja"
,
"string"
),
(
"template_vlm2vec.jinja"
,
"openai"
),
(
"tool_chat_template_granite_20b_fc.jinja"
,
"string"
),
(
"tool_chat_template_hermes.jinja"
,
"string"
),
...
...
tests/kernels/test_block_fp8.py
View file @
31330101
...
...
@@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8
,
w8a8_block_fp8_matmul
)
from
vllm.platforms
import
current_platform
from
.utils_block
import
native_w8a8_block_matmul
dg_available
=
False
try
:
import
deep_gemm
...
...
@@ -75,61 +77,6 @@ def native_per_token_group_quant_fp8(x,
return
x_q
,
x_s
def
native_w8a8_block_fp8_matmul
(
A
,
B
,
As
,
Bs
,
block_size
,
output_dtype
=
torch
.
float16
):
"""Matrix multiplication with block-wise quantization using native torch."""
A
=
A
.
to
(
torch
.
float32
)
B
=
B
.
to
(
torch
.
float32
)
assert
A
.
shape
[
-
1
]
==
B
.
shape
[
-
1
]
assert
B
.
ndim
==
2
and
B
.
is_contiguous
()
and
Bs
.
ndim
==
2
assert
len
(
block_size
)
==
2
block_n
,
block_k
=
block_size
[
0
],
block_size
[
1
]
assert
(
A
.
shape
[
-
1
]
+
block_k
-
1
)
//
block_k
==
As
.
shape
[
-
1
]
assert
A
.
shape
[:
-
1
]
==
As
.
shape
[:
-
1
]
M
=
A
.
numel
()
//
A
.
shape
[
-
1
]
N
,
K
=
B
.
shape
origin_C_shape
=
A
.
shape
[:
-
1
]
+
(
N
,
)
A
=
A
.
reshape
(
M
,
A
.
shape
[
-
1
])
As
=
As
.
reshape
(
M
,
As
.
shape
[
-
1
])
n_tiles
=
(
N
+
block_n
-
1
)
//
block_n
k_tiles
=
(
K
+
block_k
-
1
)
//
block_k
assert
n_tiles
==
Bs
.
shape
[
0
]
assert
k_tiles
==
Bs
.
shape
[
1
]
C_shape
=
(
M
,
N
)
C
=
torch
.
zeros
(
C_shape
,
dtype
=
torch
.
float32
,
device
=
A
.
device
)
A_tiles
=
[
A
[:,
i
*
block_k
:
min
((
i
+
1
)
*
block_k
,
K
)]
for
i
in
range
(
k_tiles
)
]
B_tiles
=
[[
B
[
j
*
block_n
:
min
((
j
+
1
)
*
block_n
,
N
),
i
*
block_k
:
min
((
i
+
1
)
*
block_k
,
K
),
]
for
i
in
range
(
k_tiles
)
]
for
j
in
range
(
n_tiles
)]
C_tiles
=
[
C
[:,
j
*
block_n
:
min
((
j
+
1
)
*
block_n
,
N
)]
for
j
in
range
(
n_tiles
)
]
As_tiles
=
[
As
[:,
i
:
i
+
1
]
for
i
in
range
(
k_tiles
)]
for
i
in
range
(
k_tiles
):
for
j
in
range
(
n_tiles
):
a
=
A_tiles
[
i
]
b
=
B_tiles
[
j
][
i
]
c
=
C_tiles
[
j
]
s
=
As_tiles
[
i
]
*
Bs
[
j
][
i
]
c
[:,
:]
+=
torch
.
matmul
(
a
,
b
.
t
())
*
s
C
=
C
.
reshape
(
origin_C_shape
).
to
(
output_dtype
)
return
C
def
torch_w8a8_block_fp8_moe
(
a
,
w1
,
w2
,
w1_s
,
w2_s
,
score
,
topk
,
block_shape
):
"""Fused moe with block-wise quantization using native torch."""
B
,
D
=
a
.
shape
...
...
@@ -146,22 +93,22 @@ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
for
i
in
range
(
w1
.
shape
[
0
]):
mask
=
topk_ids
==
i
if
mask
.
sum
():
inter_out
=
native_w8a8_block_
fp8_
matmul
(
a_q
[
mask
],
w1
[
i
],
a_s
[
mask
],
w1_s
[
i
],
block_shape
,
output_dtype
=
a
.
dtype
)
inter_out
=
native_w8a8_block_matmul
(
a_q
[
mask
],
w1
[
i
],
a_s
[
mask
],
w1_s
[
i
],
block_shape
,
output_dtype
=
a
.
dtype
)
act_out
=
SiluAndMul
().
forward_native
(
inter_out
)
act_out_q
,
act_out_s
=
native_per_token_group_quant_fp8
(
act_out
,
block_k
)
act_out
=
act_out
.
to
(
torch
.
float32
)
out
[
mask
]
=
native_w8a8_block_
fp8_
matmul
(
act_out_q
,
w2
[
i
],
act_out_s
,
w2_s
[
i
],
block_shape
,
output_dtype
=
a
.
dtype
)
out
[
mask
]
=
native_w8a8_block_matmul
(
act_out_q
,
w2
[
i
],
act_out_s
,
w2_s
[
i
],
block_shape
,
output_dtype
=
a
.
dtype
)
return
(
out
.
view
(
B
,
-
1
,
w2
.
shape
[
1
])
*
topk_weight
.
view
(
B
,
-
1
,
1
).
to
(
out
.
dtype
)).
sum
(
dim
=
1
)
...
...
@@ -215,8 +162,8 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
As
=
torch
.
rand
(
M
,
k_tiles
,
dtype
=
torch
.
float32
)
*
factor_for_scale
Bs
=
torch
.
rand
(
n_tiles
,
k_tiles
,
dtype
=
torch
.
float32
)
*
factor_for_scale
ref_out
=
native_w8a8_block_
fp8_
matmul
(
A_fp8
,
B_fp8
,
As
,
Bs
,
block_size
,
out_dtype
)
ref_out
=
native_w8a8_block_matmul
(
A_fp8
,
B_fp8
,
As
,
Bs
,
block_size
,
out_dtype
)
out
=
w8a8_block_fp8_matmul
(
A_fp8
,
B_fp8
,
As
,
Bs
,
block_size
,
out_dtype
)
rel_diff
=
(
torch
.
mean
(
...
...
@@ -239,8 +186,6 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
fp8_info
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
fp8_max
,
fp8_min
=
fp8_info
.
max
,
fp8_info
.
min
vllm_config
=
VllmConfig
()
a
=
torch
.
randn
((
M
,
K
),
dtype
=
dtype
)
/
10
w1_bf16
=
(
torch
.
rand
(
...
...
@@ -266,6 +211,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
score
=
torch
.
randn
((
M
,
E
),
dtype
=
dtype
)
# Set the context to avoid lots of warning spam.
vllm_config
=
VllmConfig
()
with
set_current_vllm_config
(
vllm_config
):
out
=
fused_moe
(
a
,
...
...
@@ -334,8 +280,8 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
As
=
As_fp8
.
to
(
torch
.
float32
)
Bs
=
Bs_fp8
.
to
(
torch
.
float32
)
ref_out
=
native_w8a8_block_
fp8_
matmul
(
A_fp8
,
B_fp8
,
As
,
Bs
,
block_size
,
out_dtype
)
ref_out
=
native_w8a8_block_matmul
(
A_fp8
,
B_fp8
,
As
,
Bs
,
block_size
,
out_dtype
)
# Transpose earlier so that the testing will not trigger transposing kernels
As_fp8
=
deep_gemm
.
get_col_major_tma_aligned_tensor
(
As_fp8
)
...
...
tests/kernels/test_block_int8.py
0 → 100644
View file @
31330101
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py
import
itertools
import
pytest
import
torch
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.model_executor.layers.quantization.utils.int8_utils
import
(
w8a8_block_int8_matmul
)
from
vllm.platforms
import
current_platform
from
.utils_block
import
native_w8a8_block_matmul
if
current_platform
.
get_device_capability
()
<
(
7
,
0
):
pytest
.
skip
(
"INT8 Triton requires CUDA 7.0 or higher"
,
allow_module_level
=
True
)
# For test
def native_per_token_group_quant_int8(x,
                                      group_size,
                                      eps=1e-10,
                                      dtype=torch.int8):
    """Quantize `x` per token group to an integer dtype using native torch.

    The elements of `x` are split into consecutive groups of `group_size`
    values along the last dimension. Each group gets its own scale,
    `max(|group|, eps) / iinfo(dtype).max`, and is divided by that scale,
    rounded and clamped to the representable range of `dtype`.

    Args:
        x: Contiguous tensor whose last dimension is a multiple of
            `group_size`.
        group_size: Number of consecutive elements sharing one scale.
        eps: Lower bound applied to each group's absolute maximum so that an
            all-zero group does not produce a zero scale.
        dtype: Integer dtype of the quantized output (default `torch.int8`).

    Returns:
        A tuple `(x_q, x_s)`: `x_q` has the shape of `x` and dtype `dtype`;
        `x_s` holds the float32 scales with shape
        `x.shape[:-1] + (x.shape[-1] // group_size,)`.

    Raises:
        AssertionError: If the last dimension of `x` is not divisible by
            `group_size`, or if `x` is not contiguous.
    """
    assert x.shape[-1] % group_size == 0, (
        "the last dimension of `x` must be divisible by `group_size`")
    assert x.is_contiguous(), "`x` is not contiguous"

    iinfo = torch.iinfo(dtype)
    int8_min = iinfo.min
    int8_max = iinfo.max

    # One row per quantization group.
    x_ = x.reshape(x.numel() // group_size, group_size)
    # Use float32 for scale calculation for stability
    amax = x_.abs().max(dim=-1,
                        keepdim=True)[0].clamp(min=eps).to(torch.float32)
    x_s = amax / int8_max
    # Round before clamping
    x_q = (x_.to(torch.float32) / x_s).round().clamp(
        min=int8_min, max=int8_max).to(dtype)

    x_q = x_q.reshape(x.shape)
    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, ))
    return x_q, x_s
# For test
def torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
    """Reference fused MoE with block-wise int8 quantization in native torch.

    Every token of `a` is duplicated `topk` times, routed to the `topk`
    experts with the highest softmax score, pushed through that expert's
    two quantized matmuls (with SiLU-and-mul in between) and finally
    combined using the routing weights.

    Args:
        a: 2-D activations; unpacked as `(B, D)`.
        w1: Stacked first-layer expert weights; `w1.shape[0]` is the number
            of experts.
        w2: Stacked second-layer expert weights; `w2.shape[1]` is the output
            width.
        w1_s: Per-expert block scales for `w1`.
        w2_s: Per-expert block scales for `w2`.
        score: Router logits; softmax is taken over the last dimension.
        topk: Number of experts selected per token.
        block_shape: Two-element block size; the second entry is used as the
            activation quantization group size.

    Returns:
        Tensor of shape `(B, w2.shape[1])` holding the weighted sum of the
        selected experts' outputs.
    """
    B, D = a.shape
    # Repeat each token `topk` times so every (token, expert) pair has a row.
    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)

    score = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)

    block_k = block_shape[1]
    a_q, a_s = native_per_token_group_quant_int8(a, block_k)

    # The activation module does not depend on the expert, so build it once.
    activation = SiluAndMul()

    for i in range(w1.shape[0]):
        mask = topk_ids == i
        if mask.sum():
            inter_out = native_w8a8_block_matmul(a_q[mask],
                                                 w1[i],
                                                 a_s[mask],
                                                 w1_s[i],
                                                 block_shape,
                                                 output_dtype=a.dtype)
            act_out = activation.forward_native(inter_out)
            act_out_q, act_out_s = native_per_token_group_quant_int8(
                act_out, block_k)
            out[mask] = native_w8a8_block_matmul(act_out_q,
                                                 w2[i],
                                                 act_out_s,
                                                 w2_s[i],
                                                 block_shape,
                                                 output_dtype=a.dtype)

    # Apply routing weights and sum over the topk copies of each token.
    return (out.view(B, -1, w2.shape[1]) *
            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
]
M
=
[
1
,
33
,
64
,
222
]
N
=
[
128
,
1024
]
K
=
[
256
,
4096
]
E
=
[
8
,
24
]
TOP_KS
=
[
2
,
6
]
# BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
BLOCK_SIZE
=
[[
128
,
128
]]
SEEDS
=
[
0
]
@pytest.fixture(autouse=True, scope="module")
def setup_cuda():
    """Make CUDA the default torch device for the tests in this module.

    The fixture is autouse and module-scoped, so it runs once without being
    requested explicitly.
    """
    default_device = "cuda"
    torch.set_default_device(default_device)
@pytest.mark.parametrize(
    "M,N,K,block_size,out_dtype,seed",
    itertools.product(M, N, K, BLOCK_SIZE, DTYPES, SEEDS),
)
@torch.inference_mode()
def test_w8a8_block_int8_matmul(M, N, K, block_size, out_dtype, seed):
    """Check `w8a8_block_int8_matmul` against the native torch reference.

    Random operands and random block scales are generated from `seed`, both
    implementations are run on them, and the mean absolute difference
    relative to the reference magnitude must stay below 0.001.
    """
    torch.manual_seed(seed)
    scale_factor = 1e-2
    int8_limits = torch.iinfo(torch.int8)
    upper, lower = int8_limits.max, int8_limits.min

    # NOTE(review): the operands are clamped to the int8 range but cast to
    # float8_e4m3fn although this is the int8 kernel test - confirm whether
    # torch.int8 was intended.
    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * upper
    A_fp8 = A_fp32.clamp(min=lower, max=upper).to(torch.float8_e4m3fn)

    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * upper
    B_fp8 = B_fp32.clamp(min=lower, max=upper).to(torch.float8_e4m3fn)

    # Number of scale blocks along N and K (ceiling division).
    block_n = block_size[0]
    block_k = block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k

    As = torch.rand(M, k_tiles, dtype=torch.float32) * scale_factor
    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * scale_factor

    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
                                       out_dtype)
    out = w8a8_block_int8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)

    expected = ref_out.to(torch.float32)
    actual = out.to(torch.float32)
    rel_diff = (torch.mean(torch.abs(actual - expected)) /
                torch.mean(torch.abs(expected)))
    assert rel_diff < 0.001
@pytest.mark.parametrize(
    "M, N, K, E, topk, block_size, dtype, seed",
    itertools.product(M, N, K, E, TOP_KS, BLOCK_SIZE, DTYPES, SEEDS),
)
@torch.inference_mode()
def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
    """Compare `fused_moe` with W8A8 INT8 block quantization against the
    native torch reference `torch_w8a8_block_int8_moe`.

    The mean absolute difference relative to the reference magnitude must
    stay below 0.06.
    """
    torch.manual_seed(seed)
    # Use a smaller factor for scale initialization to prevent large
    # values/overflow especially when output dtype might be float16
    scale_factor = 1e-2
    int8_limits = torch.iinfo(torch.int8)
    upper, lower = int8_limits.max, int8_limits.min

    a = torch.randn((M, K), dtype=dtype) / 10

    # Random int8 expert weights: w1 is (E, 2N, K), w2 is (E, K, N).
    w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * upper
    w1 = w1_fp32.clamp(min=lower, max=upper).to(torch.int8)

    w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * upper
    w2 = w2_fp32.clamp(min=lower, max=upper).to(torch.int8)

    # Number of scale blocks per weight matrix (ceiling division).
    block_n = block_size[0]
    block_k = block_size[1]
    n_tiles_w1 = (2 * N + block_n - 1) // block_n
    n_tiles_w2 = (K + block_n - 1) // block_n
    k_tiles_w1 = (K + block_k - 1) // block_k
    k_tiles_w2 = (N + block_k - 1) // block_k

    w1_s = torch.rand((E, n_tiles_w1, k_tiles_w1),
                      dtype=torch.float32) * scale_factor
    w2_s = torch.rand((E, n_tiles_w2, k_tiles_w2),
                      dtype=torch.float32) * scale_factor

    score = torch.randn((M, E), dtype=dtype)

    # Set the context to avoid lots of warning spam.
    vllm_config = VllmConfig()
    with set_current_vllm_config(vllm_config):
        out = fused_moe(
            a,
            w1,
            w2,
            score,
            topk,
            renormalize=False,
            use_int8_w8a8=True,
            w1_scale=w1_s,
            w2_scale=w2_s,
            block_shape=block_size,
        )
        ref_out = torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk,
                                            block_size)

    # Check results
    expected = ref_out.to(torch.float32)
    actual = out.to(torch.float32)
    rel_diff = (torch.mean(torch.abs(actual - expected)) /
                torch.mean(torch.abs(expected)))
    assert rel_diff < 0.06
tests/kernels/test_flashmla.py
View file @
31330101
...
...
@@ -124,7 +124,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
cal_diff
(
out_flash
,
out_torch
,
"out"
)
cal_diff
(
lse_flash
,
lse_torch
,
"lse"
)
t
=
triton
.
testing
.
do_bench
(
flash_mla
,
fast_flush
=
False
)
t
=
triton
.
testing
.
do_bench
(
flash_mla
)
FLOPS
=
s_q
*
total_seqlens
*
h_q
*
(
d
+
dv
)
*
2
bytes
=
(
total_seqlens
*
h_kv
*
d
+
b
*
s_q
*
h_q
*
d
+
b
*
s_q
*
h_q
*
dv
)
*
(
torch
.
finfo
(
dtype
).
bits
//
8
)
...
...
tests/kernels/test_int8_kernel.py
0 → 100644
View file @
31330101
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py
import
itertools
import
pytest
import
torch
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.model_executor.layers.quantization.utils.int8_utils
import
(
per_token_quant_int8
)
from
vllm.platforms
import
current_platform
if
current_platform
.
get_device_capability
()
<
(
7
,
0
):
pytest
.
skip
(
"INT8 Triton requires CUDA 7.0 or higher"
,
allow_module_level
=
True
)
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
    """Reference matmul with per-token activation scales and per-column
    weight scales.

    Computes `As * (A @ B.T) * Bs` in float32 and casts the result to
    `output_dtype`.

    Args:
        A: Activations; every leading dimension is flattened into rows and
            the last dimension must equal `B.shape[-1]`.
        B: 2-D contiguous weight matrix, one row per output column.
        As: Per-token scale, broadcast against the `[M, K]` product (the
            callers in this file pass an `[M, 1]` tensor).
        Bs: Per-column scale; viewed as `[1, K]` before broadcasting.
        output_dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape `A.shape[:-1] + (B.shape[0],)` in `output_dtype`.
    """
    lhs = A.to(torch.float32)
    weight = B.to(torch.float32)

    assert lhs.shape[-1] == weight.shape[-1], "Dimension mismatch"
    assert weight.ndim == 2 and weight.is_contiguous(
    ), "B must be a 2D contiguous tensor"

    # Collapse all leading dimensions of the activations into rows.
    num_rows = lhs.numel() // lhs.shape[-1]
    weight_t = weight.t()  # [in_features, out_features]
    in_features, out_features = weight_t.shape
    result_shape = lhs.shape[:-1] + (out_features, )
    lhs_2d = lhs.reshape(num_rows, in_features)

    product = torch.matmul(lhs_2d, weight_t)  # [num_rows, out_features]
    # Per-token scale first, then the broadcast per-column scale.
    scaled = As * product * Bs.view(1, -1)

    return scaled.reshape(result_shape).to(output_dtype)
def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
    """Reference fused MoE with per-token int8 activations and per-column
    weight scales, written in native torch.

    Args:
        a: 2-D activations; unpacked as `(B, D)`.
        w1: Stacked first-layer expert weights; `w1.shape[0]` is the number
            of experts.
        w2: Stacked second-layer expert weights; `w2.shape[1]` is the output
            width.
        w1_s: Per-expert, per-column scales for `w1`.
        w2_s: Per-expert, per-column scales for `w2`.
        score: Router logits; softmax is taken over the last dimension.
        topk: Number of experts selected per token.

    Returns:
        Tensor of shape `(B, w2.shape[1])` with the routing-weighted sum of
        the selected experts' outputs.
    """
    B, D = a.shape
    out_width = w2.shape[1]

    # Quantize once per token, then replicate the quantized rows and their
    # scales `topk` times so every (token, expert) pair has its own row.
    tokens_q, token_scales = per_token_quant_int8(a)
    tokens_q = tokens_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
    token_scales = token_scales.view(B, -1, 1).repeat(1, topk, 1).reshape(
        -1, 1)  # [B*topk, 1]

    out = torch.zeros(B * topk, out_width, dtype=a.dtype, device=a.device)

    # Routing: softmax in float32, keep the topk experts per token.
    probs = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(probs, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)

    for expert in range(w1.shape[0]):
        mask = topk_ids == expert
        if not mask.sum():
            # No row was routed to this expert.
            continue
        # First MLP layer, using the per-token activation scale.
        inter_out = native_w8a8_per_token_matmul(tokens_q[mask],
                                                 w1[expert],
                                                 token_scales[mask],
                                                 w1_s[expert],
                                                 output_dtype=a.dtype)
        act_out = SiluAndMul().forward_native(inter_out)
        # Re-quantize the activation output per token for the second layer.
        act_out_q, act_out_s = per_token_quant_int8(act_out)
        out[mask] = native_w8a8_per_token_matmul(act_out_q,
                                                 w2[expert],
                                                 act_out_s,
                                                 w2_s[expert],
                                                 output_dtype=a.dtype)

    # Weight each expert output by its routing probability and sum the topk
    # copies of every token.
    weighted = out.view(B, -1, out_width) * topk_weight.view(B, -1, 1).to(
        out.dtype)
    return weighted.sum(dim=1)
@pytest.fixture(autouse=True, scope="module")
def setup_cuda():
    """Make CUDA the default torch device for the tests in this module.

    The fixture is autouse and module-scoped, so it runs once without being
    requested explicitly.
    """
    default_device = "cuda"
    torch.set_default_device(default_device)
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
]
M
=
[
1
,
33
]
N
=
[
128
,
1024
]
K
=
[
256
,
4096
]
E
=
[
8
]
TOP_KS
=
[
2
,
6
]
SEEDS
=
[
0
]
@pytest.mark.parametrize(
    "M, N, K, E, topk, dtype, seed",
    itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
)
@torch.inference_mode()
def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
    """Compare `fused_moe` with per-channel int8 W8A8 quantization against
    the native torch reference `torch_w8a8_per_column_moe`.

    Despite the `fp8` in its name, this test builds int8 weights and calls
    `fused_moe` with `use_int8_w8a8=True`. The mean absolute difference
    relative to the reference magnitude must stay below 0.05.
    """
    torch.manual_seed(seed)

    # Int8 quantization parameters
    scale_factor = 1e-2
    upper = 127
    lower = -128

    # Input activations, M x K
    a = torch.randn((M, K), dtype=dtype) / 10

    # Random weights in [-1, 1) scaled to the int8 range.
    w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2
    w1 = (w1_fp32 * upper).clamp(min=lower, max=upper).to(torch.int8)

    w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2
    w2 = (w2_fp32 * upper).clamp(min=lower, max=upper).to(torch.int8)

    # One scale per output column of each expert (per-column quantization)
    w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * scale_factor
    w2_s = torch.rand(E, K, device=w2_fp32.device) * scale_factor

    score = torch.randn((M, E), dtype=dtype)

    ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk)
    out = fused_moe(
        a,
        w1,
        w2,
        score,
        topk,
        renormalize=False,
        use_int8_w8a8=True,  # Using int8-w8a8
        per_channel_quant=True,
        w1_scale=w1_s,
        w2_scale=w2_s,
        block_shape=None,  # Not using block quantization
    )

    # Check results
    expected = ref_out.to(torch.float32)
    actual = out.to(torch.float32)
    rel_diff = (torch.mean(torch.abs(actual - expected)) /
                torch.mean(torch.abs(expected)))
    assert rel_diff < 0.05
tests/kernels/test_merge_attn_states.py
0 → 100644
View file @
31330101
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
import
pytest
import
torch
from
vllm._custom_ops
import
merge_attn_states
as
merge_attn_states_cuda
from
vllm.attention.ops.triton_merge_attn_states
import
(
merge_attn_states
as
merge_attn_states_triton
)
from
vllm.platforms
import
current_platform
# Naive PyTorch implementation of section 2.2 of https://www.arxiv.org/pdf/2501.01005;
# it can be used to combine partial attention results (in the split-KV case).
def merge_attn_states_torch(
    output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
    prefix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
    prefix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
    suffix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
    suffix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
    output_lse: Optional[torch.Tensor] = None,  # [NUM_HEADS, NUM_TOKENS]
):
    """Merge two partial attention results using their log-sum-exp values.

    Each partial output is weighted by `exp(lse - max_lse)` normalized over
    both parts. An LSE of `+inf` is treated as `-inf`, so that part gets
    zero weight.

    The input tensors are not modified; the merged results are returned as
    new tensors.

    Args:
        output: Unused by this reference implementation; kept so the
            signature matches the kernels it is compared against.
        prefix_output: Partial attention output of the prefix.
        prefix_lse: Log-sum-exp of the prefix part.
        suffix_output: Partial attention output of the suffix.
        suffix_lse: Log-sum-exp of the suffix part.
        output_lse: Only checked against `None`; when it is not `None` the
            merged LSE is computed and returned.

    Returns:
        A tuple `(output, output_lse)`: the merged attention output and
        either the merged LSE or `None` when `output_lse` was `None`.
    """
    # inf -> -inf, done out of place so the caller's LSE tensors are left
    # untouched.
    p_lse = prefix_lse.masked_fill(prefix_lse == torch.inf, -torch.inf)
    s_lse = suffix_lse.masked_fill(suffix_lse == torch.inf, -torch.inf)

    # max_lse [NUM_HEADS, NUM_TOKENS]; subtracting it keeps exp() stable.
    max_lse = torch.maximum(p_lse, s_lse)
    p_lse_exp = torch.exp(p_lse - max_lse)
    s_lse_exp = torch.exp(s_lse - max_lse)
    out_se = p_lse_exp + s_lse_exp

    if output_lse is not None:
        output_lse = torch.log(out_se) + max_lse

    p_scale = p_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
    s_scale = s_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
    p_scale = torch.transpose(p_scale, 0,
                              1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
    s_scale = torch.transpose(s_scale, 0,
                              1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]

    output = prefix_output * p_scale + suffix_output * s_scale
    return output, output_lse
NUM_BATCH_TOKENS
=
[
256
,
512
,
613
,
1024
,
1536
,
4096
]
NUM_QUERY_HEADS
=
[
4
,
8
,
16
,
32
,
48
,
64
]
HEAD_SIZES
=
[
32
,
48
,
64
,
96
,
128
,
256
]
DTYPES
=
[
torch
.
float32
,
torch
.
half
,
torch
.
bfloat16
]
all_case_info
:
list
[
tuple
]
=
[]
def
generate_markdown_table
():
global
all_case_info
table_header
=
(
"| tokens | heads | headsize | dtype "
"| device | torch | triton | cuda | speedup |"
)
table_separator
=
"| --- | --- | --- | --- | --- | --- | --- | --- | --- |"
def
shortly_dtype
(
dtype
:
torch
.
dtype
)
->
str
:
return
str
(
dtype
).
removeprefix
(
"torch."
)
def
shortly_device
(
device
:
str
)
->
str
:
return
device
.
removeprefix
(
"NVIDIA"
).
strip
()
print
(
table_header
)
print
(
table_separator
)
for
info
in
all_case_info
:
(
num_tokens
,
num_heads
,
head_size
,
dtype
,
device
,
avg_time_torch_kernel
,
avg_time_triton_kernel
,
avg_time_cuda_kernel
,
performance_improved
)
=
info
dtype
=
shortly_dtype
(
dtype
)
device
=
shortly_device
(
device
)
print
(
f
"|
{
num_tokens
}
|
{
num_heads
}
|
{
head_size
}
"
f
"|
{
dtype
}
|
{
device
}
|
{
avg_time_torch_kernel
:.
5
f
}
ms "
f
"|
{
avg_time_triton_kernel
:.
5
f
}
ms "
f
"|
{
avg_time_cuda_kernel
:.
5
f
}
ms "
f
"|
{
performance_improved
:.
4
f
}
x |"
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_BATCH_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"num_query_heads"
,
NUM_QUERY_HEADS
)
@
pytest
.
mark
.
parametrize
(
"head_size"
,
HEAD_SIZES
)
@
pytest
.
mark
.
parametrize
(
"output_dtype"
,
DTYPES
)
@
torch
.
inference_mode
()
def
test_merge_attn_states
(
num_tokens
:
int
,
num_query_heads
:
int
,
head_size
:
int
,
output_dtype
:
torch
.
dtype
):
if
not
current_platform
.
is_cuda
():
pytest
.
skip
(
'Currently only support compare triton merge_attn_states '
'with custom cuda merge_attn_states kernel'
)
NUM_TOKENS
=
num_tokens
NUM_HEADS
=
num_query_heads
HEAD_SIZE
=
head_size
print
(
f
"
\n
NUM_TOKENS:
{
NUM_TOKENS
}
, NUM_HEADS:
{
NUM_HEADS
}
, "
f
"HEAD_SIZE:
{
HEAD_SIZE
}
, DTYPE:
{
output_dtype
}
, "
f
"Device:
{
current_platform
.
get_device_name
()
}
"
)
# prefix_lse and suffix_lse contain inf and normal values
prefix_lse
=
torch
.
randn
(
NUM_HEADS
,
NUM_TOKENS
,
dtype
=
torch
.
float32
,
device
=
"cuda"
)
suffix_lse
=
torch
.
randn
(
NUM_HEADS
,
NUM_TOKENS
,
dtype
=
torch
.
float32
,
device
=
"cuda"
)
# Generate boolean masks
mask_prefix
=
torch
.
rand
(
NUM_HEADS
,
NUM_TOKENS
)
<
0.1
mask_suffix
=
torch
.
rand
(
NUM_HEADS
,
NUM_TOKENS
)
<
0.1
# Ensure that the same position is not True at the same time
combined_mask
=
torch
.
logical_and
(
mask_prefix
,
mask_suffix
)
mask_prefix
=
torch
.
logical_and
(
mask_prefix
,
~
combined_mask
)
mask_suffix
=
torch
.
logical_and
(
mask_suffix
,
~
combined_mask
)
prefix_lse
[
mask_prefix
]
=
float
(
'inf'
)
suffix_lse
[
mask_suffix
]
=
float
(
'inf'
)
# Other input tensors (need to be initialized but
# no actual calculation needed)
output
=
torch
.
zeros
((
NUM_TOKENS
,
NUM_HEADS
,
HEAD_SIZE
),
dtype
=
output_dtype
,
device
=
"cuda"
)
output_lse
=
torch
.
zeros
((
NUM_HEADS
,
NUM_TOKENS
),
dtype
=
torch
.
float32
,
device
=
"cuda"
)
prefix_output
=
torch
.
randn
((
NUM_TOKENS
,
NUM_HEADS
,
HEAD_SIZE
),
dtype
=
output_dtype
,
device
=
"cuda"
)
suffix_output
=
torch
.
randn
((
NUM_TOKENS
,
NUM_HEADS
,
HEAD_SIZE
),
dtype
=
output_dtype
,
device
=
"cuda"
)
warmup_times
=
2
repeat_times
=
20
output_torch
=
output
.
clone
()
output_lse_torch
=
output_lse
.
clone
()
total_time_torch_kernel
=
0
start
=
torch
.
cuda
.
Event
(
enable_timing
=
True
)
end
=
torch
.
cuda
.
Event
(
enable_timing
=
True
)
# 0. Run the Torch kernel
prefix_lse_torch
=
prefix_lse
.
clone
()
suffix_lse_torch
=
suffix_lse
.
clone
()
for
_
in
range
(
warmup_times
):
output_torch
,
output_lse_torch
=
merge_attn_states_torch
(
output_torch
,
prefix_output
,
prefix_lse_torch
,
suffix_output
,
suffix_lse_torch
,
output_lse_torch
)
torch
.
cuda
.
synchronize
()
for
_
in
range
(
repeat_times
):
start
.
record
()
output_torch
,
output_lse_torch
=
merge_attn_states_torch
(
output_torch
,
prefix_output
,
prefix_lse_torch
,
suffix_output
,
suffix_lse_torch
,
output_lse_torch
)
end
.
record
()
torch
.
cuda
.
synchronize
()
total_time_torch_kernel
+=
start
.
elapsed_time
(
end
)
avg_time_torch_kernel
=
total_time_torch_kernel
/
repeat_times
# 1. Run the Triton kernel
output_ref_triton
=
output
.
clone
()
output_lse_ref_triton
=
output_lse
.
clone
()
total_time_triton_kernel
=
0
start
=
torch
.
cuda
.
Event
(
enable_timing
=
True
)
end
=
torch
.
cuda
.
Event
(
enable_timing
=
True
)
for
_
in
range
(
warmup_times
):
merge_attn_states_triton
(
output_ref_triton
,
prefix_output
,
prefix_lse
,
suffix_output
,
suffix_lse
,
output_lse_ref_triton
)
torch
.
cuda
.
synchronize
()
for
_
in
range
(
repeat_times
):
start
.
record
()
merge_attn_states_triton
(
output_ref_triton
,
prefix_output
,
prefix_lse
,
suffix_output
,
suffix_lse
,
output_lse_ref_triton
)
end
.
record
()
torch
.
cuda
.
synchronize
()
total_time_triton_kernel
+=
start
.
elapsed_time
(
end
)
avg_time_triton_kernel
=
total_time_triton_kernel
/
repeat_times
# 2. Run the CUDA kernel
total_time_cuda_kernel
=
0
output_cuda
=
output
.
clone
()
output_lse_cuda
=
output_lse
.
clone
()
for
_
in
range
(
warmup_times
):
merge_attn_states_cuda
(
output_cuda
,
prefix_output
,
prefix_lse
,
suffix_output
,
suffix_lse
,
output_lse_cuda
)
torch
.
cuda
.
synchronize
()
for
_
in
range
(
repeat_times
):
start
.
record
()
merge_attn_states_cuda
(
output_cuda
,
prefix_output
,
prefix_lse
,
suffix_output
,
suffix_lse
,
output_lse_cuda
)
end
.
record
()
torch
.
cuda
.
synchronize
()
total_time_cuda_kernel
+=
start
.
elapsed_time
(
end
)
avg_time_cuda_kernel
=
total_time_cuda_kernel
/
repeat_times
# 3. Performance compare
performance_improved
=
avg_time_triton_kernel
/
avg_time_cuda_kernel
print
(
f
" Torch time:
{
avg_time_torch_kernel
:.
6
f
}
ms"
)
print
(
f
"Triton time:
{
avg_time_triton_kernel
:.
6
f
}
ms"
)
print
(
f
" CUDA time:
{
avg_time_cuda_kernel
:.
6
f
}
ms, "
f
"Performance:
{
performance_improved
:.
5
f
}
x"
)
print
(
"-"
*
100
)
# 4. Correctness compare
# Liger Kernel: Efficient Triton Kernels for LLM Training
# https://arxiv.org/pdf/2410.10989, 3.3 Correctness
# use rtol = 1e-2 for bfloat16.
rtol
=
1e-2
if
output_dtype
==
torch
.
bfloat16
else
1e-3
def
diff
(
a
:
torch
.
Tensor
,
b
:
torch
.
Tensor
):
max_diff
=
torch
.
max
(
torch
.
abs
(
a
.
float
()
-
b
.
float
()))
return
max_diff
# Use Triton output as reference because we want to replace
# the Triton kernel with custom CUDA kernel for merge attn
# states operation.
output_ref
=
output_ref_triton
output_lse_ref
=
output_lse_ref_triton
torch
.
testing
.
assert_close
(
output_cuda
.
float
(),
output_ref
.
float
(),
atol
=
1e-3
,
rtol
=
rtol
)
print
(
"Output all match, max abs diff:"
)
print
(
f
"(Triton vs Torch) :
{
diff
(
output_torch
,
output_ref
)
}
"
)
print
(
f
" (CUDA vs Torch) :
{
diff
(
output_torch
,
output_cuda
)
}
"
)
print
(
f
" (CUDA vs Triton):
{
diff
(
output_ref
,
output_cuda
)
}
"
)
print
(
"-"
*
100
)
torch
.
testing
.
assert_close
(
output_lse_cuda
.
float
(),
output_lse_ref
.
float
(),
atol
=
1e-3
,
rtol
=
rtol
)
print
(
"Output LSE all match, max abs diff:"
)
print
(
f
"(Triton vs Torch) :
{
diff
(
output_lse_torch
,
output_lse_ref
)
}
"
)
print
(
f
" (CUDA vs Torch) :
{
diff
(
output_lse_torch
,
output_lse_cuda
)
}
"
)
print
(
f
" (CUDA vs Triton):
{
diff
(
output_lse_ref
,
output_lse_cuda
)
}
"
)
print
(
"-"
*
100
)
print
(
"All output values test passed! All inf values "
"are correctly replaced with -inf."
)
print
(
"-"
*
100
)
device
=
current_platform
.
get_device_name
()
all_case_info
.
append
(
(
NUM_TOKENS
,
NUM_HEADS
,
HEAD_SIZE
,
output_dtype
,
device
,
avg_time_torch_kernel
,
avg_time_triton_kernel
,
avg_time_cuda_kernel
,
performance_improved
))
if
len
(
all_case_info
)
==
(
len
(
NUM_BATCH_TOKENS
)
*
len
(
HEAD_SIZES
)
*
len
(
NUM_QUERY_HEADS
)
*
len
(
DTYPES
)):
generate_markdown_table
()
tests/kernels/test_triton_moe_ptpc_fp8.py
0 → 100644
View file @
31330101
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_triton_moe_channel_fp8_kernel.py
import
itertools
import
pytest
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.platforms
import
current_platform
if
current_platform
.
get_device_capability
()
<
(
9
,
0
):
pytest
.
skip
(
"FP8 Triton requires CUDA 9.0 or higher"
,
allow_module_level
=
True
)
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
    """Reference matmul with per-token activation scales and per-column
    weight scales.

    Computes `As * (A @ B.T) * Bs` in float32 and casts the result to
    `output_dtype`.

    Args:
        A: Activations; every leading dimension is flattened into rows and
            the last dimension must equal `B.shape[-1]`.
        B: 2-D contiguous weight matrix, one row per output column.
        As: Per-token scale, broadcast against the `[M, K]` product (the
            callers in this file pass an `[M, 1]` tensor).
        Bs: Per-column scale; viewed as `[1, K]` before broadcasting.
        output_dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape `A.shape[:-1] + (B.shape[0],)` in `output_dtype`.
    """
    lhs = A.to(torch.float32)
    weight = B.to(torch.float32)

    assert lhs.shape[-1] == weight.shape[-1], "Dimension mismatch"
    assert weight.ndim == 2 and weight.is_contiguous(
    ), "B must be a 2D contiguous tensor"

    # Collapse all leading dimensions of the activations into rows.
    num_rows = lhs.numel() // lhs.shape[-1]
    weight_t = weight.t()  # [in_features, out_features]
    in_features, out_features = weight_t.shape
    result_shape = lhs.shape[:-1] + (out_features, )
    lhs_2d = lhs.reshape(num_rows, in_features)

    product = torch.matmul(lhs_2d, weight_t)  # [num_rows, out_features]
    # Per-token scale first, then the broadcast per-column scale.
    scaled = As * product * Bs.view(1, -1)

    return scaled.reshape(result_shape).to(output_dtype)
def fp8_mask(a, mask):
    """Select rows of ``a`` with a boolean ``mask`` via an int8 reinterpretation.

    The tensor is viewed as int8, indexed with ``mask``, and the selection is
    viewed back as the original dtype, so the indexing itself never runs on
    the original dtype.
    """
    as_int8 = a.view(torch.int8)
    picked = as_int8[mask]
    return picked.view(a.dtype)
def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
    """Reference fused-MoE forward pass in native torch with FP8 W8A8 quantization.

    Activations are quantized per token with ``ops.scaled_fp8_quant``; each
    expert's weight comes with one scale per output column (``w1_s[i]`` /
    ``w2_s[i]``). Routing is a float32 softmax over ``score`` followed by
    top-k selection; the selected weights are used as-is (no renormalization).

    Returns a tensor with one row per input row of ``a`` and ``w2.shape[1]``
    columns, in ``a.dtype``.
    """
    num_tokens, hidden = a.shape
    out_features = w2.shape[1]

    # Per-token dynamic quantization of the input activations.
    a_q, a_s = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=True)

    # Duplicate each token, and its scale, once per selected expert.
    a_q = a_q.view(num_tokens, -1, hidden).repeat(1, topk, 1).reshape(
        -1, hidden)
    a_s = a_s.view(num_tokens, -1, 1).repeat(1, topk, 1).reshape(-1, 1)

    out = torch.zeros(num_tokens * topk,
                      out_features,
                      dtype=a.dtype,
                      device=a.device)

    # Routing: flatten so entry t * topk + k is token t's k-th choice.
    probs = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(probs, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)

    for expert in range(w1.shape[0]):
        mask = topk_ids == expert
        if not mask.sum():
            # No token was routed to this expert.
            continue

        # First MLP layer on the rows routed to this expert.
        inter_out = native_w8a8_per_token_matmul(
            fp8_mask(a_q, mask),
            w1[expert],
            fp8_mask(a_s, mask),
            w1_s[expert],
            output_dtype=a.dtype,
        )

        # Activation, then re-quantize per token for the second layer.
        act_out = SiluAndMul().forward_native(inter_out)
        act_out_q, act_out_s = ops.scaled_fp8_quant(
            act_out, use_per_token_if_dynamic=True)

        # Second MLP layer.
        out[mask] = native_w8a8_per_token_matmul(act_out_q,
                                                 w2[expert],
                                                 act_out_s,
                                                 w2_s[expert],
                                                 output_dtype=a.dtype)

    # Weight each expert output by its routing probability and sum per token.
    routed = out.view(num_tokens, -1, out_features)
    gate = topk_weight.view(num_tokens, -1, 1).to(out.dtype)
    return (routed * gate).sum(dim=1)
@pytest.fixture(autouse=True, scope="module")
def setup_cuda():
    """Run every test in this module with "cuda" as torch's default device.

    The previous default device is restored once the module's tests have
    finished, so the setting does not leak into test modules that run later
    in the same session.
    """
    previous_device = torch.get_default_device()
    torch.set_default_device("cuda")
    yield
    torch.set_default_device(previous_device)
# Parameter grid for test_w8a8_fp8_fused_moe; the test is parametrized over
# the itertools.product of these lists.
DTYPES = [torch.half, torch.bfloat16]  # dtype of the input and router scores
M = [1, 33]  # number of rows in the input tensor (input is M x K)
N = [128, 1024]  # w1 is (E, 2 * N, K) and w2 is (E, K, N)
K = [256, 4096]  # number of columns in the input tensor
E = [8]  # number of experts (leading dimension of w1 / w2)
TOP_KS = [2, 6]  # experts selected per input row
SEEDS = [0]  # values passed to torch.manual_seed
@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed",
                         itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS))
@torch.inference_mode()
def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
    """Compare ``fused_moe`` (FP8 W8A8, per-channel scales) to the torch reference.

    Random FP8 weights and per-column scales are generated, both
    implementations are run on the same inputs, and the mean absolute
    difference relative to the reference's mean magnitude must stay below 5%.
    """
    torch.manual_seed(seed)

    # FP8 (e4m3fn) representable range, used to spread the weights over it.
    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_lo, fp8_hi = fp8_info.min, fp8_info.max
    scale_factor = 1e-2

    def random_fp8_weight(*shape):
        # Uniform values in [-1, 1), stretched to the FP8 range and cast.
        dense = (torch.rand(shape, dtype=torch.float32) - 0.5) * 2
        return (dense * fp8_hi).clamp(min=fp8_lo,
                                      max=fp8_hi).to(torch.float8_e4m3fn)

    # NOTE: the order of the random draws below determines the generated
    # data for a given seed; keep it unchanged.
    # Input tensor, M x K.
    a = torch.randn((M, K), dtype=dtype) / 10

    # FP8 expert weights.
    w1 = random_fp8_weight(E, 2 * N, K)
    w2 = random_fp8_weight(E, K, N)

    # One scale per output column of each expert weight.
    w1_s = torch.rand(E, 2 * N, device=w1.device) * scale_factor
    w2_s = torch.rand(E, K, device=w2.device) * scale_factor

    score = torch.randn((M, E), dtype=dtype)

    ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk)
    out = fused_moe(
        a,
        w1,
        w2,
        score,
        topk,
        renormalize=False,
        use_fp8_w8a8=True,  # using fp8
        per_channel_quant=True,
        w1_scale=w1_s,
        w2_scale=w2_s,
        block_shape=None,  # Not using block quantization
    )

    # Mean absolute error relative to the reference's mean magnitude.
    out_f32 = out.to(torch.float32)
    ref_f32 = ref_out.to(torch.float32)
    rel_diff = (torch.mean(torch.abs(out_f32 - ref_f32)) /
                torch.mean(torch.abs(ref_f32)))
    assert rel_diff < 0.05
tests/kernels/utils_block.py
0 → 100644
View file @
31330101
# SPDX-License-Identifier: Apache-2.0
import
torch
def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor,
                             As: torch.Tensor, Bs: torch.Tensor, block_size,
                             output_dtype):
    """Block-wise quantized matrix multiplication implemented in native torch.

    The inputs are cast to float32, so the function does not depend on the
    quantized dtype and serves both int8 and fp8 data.

    Args:
        A: Quantized activations; the last dimension is the reduction
            dimension and all leading dimensions are flattened into rows.
        B: Quantized weight, 2D and contiguous, laid out as (N, K).
        As: Activation scales with the same leading dimensions as ``A`` and
            ``ceil(K / block_k)`` entries in the last dimension.
        Bs: Weight scales of shape
            ``(ceil(N / block_n), ceil(K / block_k))``.
        block_size: Two-element sequence ``(block_n, block_k)``.
        output_dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``A.shape[:-1] + (N,)`` in ``output_dtype``.
    """
    lhs = A.to(torch.float32)
    rhs = B.to(torch.float32)

    assert lhs.shape[-1] == rhs.shape[-1]
    assert rhs.ndim == 2 and rhs.is_contiguous() and Bs.ndim == 2
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]
    assert (lhs.shape[-1] + block_k - 1) // block_k == As.shape[-1]
    assert lhs.shape[:-1] == As.shape[:-1]

    num_rows = lhs.numel() // lhs.shape[-1]
    out_features, in_features = rhs.shape
    result_shape = lhs.shape[:-1] + (out_features, )

    lhs = lhs.reshape(num_rows, lhs.shape[-1])
    lhs_scales = As.reshape(num_rows, As.shape[-1])

    n_tiles = (out_features + block_n - 1) // block_n
    k_tiles = (in_features + block_k - 1) // block_k
    assert n_tiles == Bs.shape[0]
    assert k_tiles == Bs.shape[1]

    acc = torch.zeros((num_rows, out_features),
                      dtype=torch.float32,
                      device=lhs.device)

    for j in range(n_tiles):
        n_lo = j * block_n
        n_hi = min((j + 1) * block_n, out_features)
        # Accumulate over the K tiles in ascending order for this N tile.
        for i in range(k_tiles):
            k_lo = i * block_k
            k_hi = min((i + 1) * block_k, in_features)
            # One activation scale per row times the scalar weight-block scale.
            scale = lhs_scales[:, i:i + 1] * Bs[j][i]
            partial = torch.matmul(lhs[:, k_lo:k_hi],
                                   rhs[n_lo:n_hi, k_lo:k_hi].t())
            acc[:, n_lo:n_hi] += partial * scale

    return acc.reshape(result_shape).to(output_dtype)
tests/lora/conftest.py
View file @
31330101
...
...
@@ -271,3 +271,15 @@ def run_with_both_engines_lora(request, monkeypatch):
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
yield
@pytest.fixture
def reset_default_device():
    """
    Restore torch's default device after the test has run.

    Tests such as `test_punica_ops.py` set the default device explicitly,
    which would otherwise carry over into the tests that follow. The device
    in effect before the test is recorded here and set again afterwards.
    """
    device_before_test = torch.get_default_device()
    yield
    torch.set_default_device(device_before_test)
tests/lora/test_baichuan.py
View file @
31330101
...
...
@@ -76,7 +76,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
max_num_seqs
=
16
,
max_loras
=
4
,
max_lora_rank
=
64
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
fully_sharded_loras
=
fully_sharded
)
output_tp1
=
do_sample
(
llm_tp1
,
baichuan_lora_files
,
lora_id
=
1
)
...
...
tests/lora/test_chatglm3_tp.py
View file @
31330101
...
...
@@ -63,7 +63,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
enable_lora
=
True
,
max_loras
=
4
,
max_lora_rank
=
64
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
enable_chunked_prefill
=
True
)
...
...
tests/lora/test_layers.py
View file @
31330101
...
...
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
@
pytest
.
fixture
(
autouse
=
True
)
def
clean_cache
(
):
def
clean_cache
_reset_device
(
reset_default_device
):
# Release any memory we might be holding on to. CI runs OOMs otherwise.
from
vllm.lora.ops.triton_ops.utils
import
(
_LORA_A_PTR_DICT
,
_LORA_B_PTR_DICT
)
...
...
tests/lora/test_llama_tp.py
View file @
31330101
...
...
@@ -89,7 +89,6 @@ def test_llama_lora(sql_lora_files):
# also test odd max_num_seqs
max_num_seqs
=
13
,
max_loras
=
4
,
tensor_parallel_size
=
1
,
enable_chunked_prefill
=
True
)
generate_and_test
(
llm
,
sql_lora_files
)
...
...
tests/lora/test_punica_ops.py
View file @
31330101
...
...
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
from
.utils
import
PunicaTensors
,
assert_close
,
generate_data_for_nslices
@pytest.fixture(autouse=True)
def reset_device(reset_default_device):
    """Apply the `reset_default_device` fixture to every test in this module.

    The body is intentionally empty: requesting `reset_default_device` from
    an autouse fixture is what activates it for each test.
    """
    pass
# Utility shrink and expand operations used as reference implementations.
def
sgmv_shrink_for_nslices
(
nslices
:
int
,
inputs_tensor
:
torch
.
Tensor
,
...
...
tests/lora/test_quant_model.py
View file @
31330101
...
...
@@ -81,12 +81,7 @@ def do_sample(llm: vllm.LLM,
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
def
test_quant_model_lora
(
tinyllama_lora_files
,
num_gpus_available
,
model
,
tp_size
):
if
num_gpus_available
<
tp_size
and
\
tp_size
>
1
and
current_platform
.
is_cuda_alike
():
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
def
test_quant_model_lora
(
tinyllama_lora_files
,
model
):
llm
=
vllm
.
LLM
(
model
=
model
.
model_path
,
...
...
@@ -94,7 +89,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
max_num_seqs
=
16
,
max_loras
=
4
,
max_model_len
=
400
,
tensor_parallel_size
=
tp_size
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
,
trust_remote_code
=
True
,
...
...
@@ -188,7 +182,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
4
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
,
trust_remote_code
=
True
,
...
...
tests/lora/test_transfomers_model.py
View file @
31330101
...
...
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
enable_lora
=
True
,
max_loras
=
4
,
max_lora_rank
=
16
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
enable_chunked_prefill
=
True
)
...
...
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
31330101
...
...
@@ -13,8 +13,8 @@ from vllm.multimodal.audio import resample_audio
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
HfRunner
,
VllmRunner
from
....utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...registry
import
HF_EXAMPLE_MODELS
from
...utils
import
check_logprobs_close
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
)
...
...
@@ -58,7 +58,10 @@ def server(request, audio_assets):
for
key
,
value
in
request
.
param
.
items
()
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
,
env_dict
=
{
"VLLM_AUDIO_FETCH_TIMEOUT"
:
"30"
})
as
remote_server
:
yield
remote_server
...
...
@@ -109,6 +112,10 @@ def run_test(
**
kwargs
,
):
"""Inference result should be the same between hf and vllm."""
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
...
...
@@ -159,6 +166,10 @@ def run_multi_audio_test(
num_logprobs
:
int
,
**
kwargs
,
):
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
,
...
...
tests/models/decoder_only/language/test_gguf.py
View file @
31330101
...
...
@@ -9,11 +9,13 @@ from typing import NamedTuple
import
pytest
from
huggingface_hub
import
hf_hub_download
from
pytest
import
MarkDecorator
from
transformers
import
AutoTokenizer
from
tests.quantization.utils
import
is_quant_method_supported
from
....conftest
import
VllmRunner
from
....utils
import
multi_gpu_test
from
...utils
import
check_logprobs_close
from
....utils
import
models_path_prefix
...
...
@@ -26,6 +28,7 @@ class GGUFTestConfig(NamedTuple):
original_model
:
str
gguf_repo
:
str
gguf_filename
:
str
marks
:
list
[
MarkDecorator
]
=
[]
@
property
def
gguf_model
(
self
):
...
...
@@ -36,6 +39,7 @@ LLAMA_CONFIG = GGUFTestConfig(
original_model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
gguf_repo
=
os
.
path
.
join
(
models_path_prefix
,
"bartowski/Llama-3.2-1B-Instruct-GGUF"
),
gguf_filename
=
os
.
path
.
join
(
models_path_prefix
,
"Llama-3.2-1B-Instruct-IQ4_XS.gguf"
),
marks
=
[
pytest
.
mark
.
quant_model
],
)
QWEN2_CONFIG
=
GGUFTestConfig
(
...
...
@@ -82,34 +86,24 @@ MODELS = [
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gguf"
),
reason
=
"gguf is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
])
def
test_models
(
num_gpus_available
:
int
,
def
check_model_outputs
(
vllm_runner
:
type
[
VllmRunner
],
example_
prompts
:
list
[
str
],
prompts
:
list
[
str
],
model
:
GGUFTestConfig
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
tp_size
:
int
,
)
->
None
:
if
num_gpus_available
<
tp_size
:
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
.
original_model
)
if
tokenizer
.
chat_template
is
not
None
:
messages
=
[[{
'role'
:
'user'
,
'content'
:
prompt
}]
for
prompt
in
example_prompts
]
example_prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
}]
for
prompt
in
prompts
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
# Run gguf model.
with
vllm_runner
(
model_name
=
model
.
gguf_model
,
...
...
@@ -119,17 +113,19 @@ def test_models(
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tp_size
)
as
gguf_model
:
gguf_outputs
=
gguf_model
.
generate_greedy_logprobs
(
example_
prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
# Run unquantized model.
# Should run with tp=1, otherwise the test will stuck at
# nccl initialization.
with
vllm_runner
(
model_name
=
model
.
original_model
,
enforce_eager
=
True
,
# faster tests
dtype
=
dtype
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tp_size
)
as
original_model
:
tensor_parallel_size
=
1
)
as
original_model
:
original_outputs
=
original_model
.
generate_greedy_logprobs
(
example_
prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
prompts
[:
-
1
],
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
original_outputs
,
...
...
@@ -137,3 +133,47 @@ def test_models(
name_0
=
"original"
,
name_1
=
"gguf"
,
)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", [
    pytest.param(test_config, marks=test_config.marks)
    for test_config in MODELS
])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1])
def test_models(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Run the GGUF output check for every config in MODELS with tp_size=1.

    Each config is wrapped in ``pytest.param`` so that the marks stored on
    the config itself are applied to its test case. The comparison is
    delegated to ``check_model_outputs``.
    """
    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
                        num_logprobs, tp_size)
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [2])
@multi_gpu_test(num_gpus=2)
def test_distributed(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Run the GGUF output check for LLAMA_CONFIG with tp_size=2.

    Decorated with ``multi_gpu_test(num_gpus=2)``; only a single model and a
    short generation (8 tokens) are used. The comparison is delegated to
    ``check_model_outputs``.
    """
    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
                        num_logprobs, tp_size)
tests/models/decoder_only/vision_language/test_models.py
View file @
31330101
...
...
@@ -163,17 +163,32 @@ VLM_TEST_SETTINGS = {
),
"aya_vision"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"CohereForAI/aya-vision-8b"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{
img_prompt
}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<image>What is the season?"
,
# noqa: E501
}),
multi_image_prompt
=
"<image><image>Describe the two images in detail."
,
# noqa: E501
max_model_len
=
8192
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"crop_to_patches"
:
True
}},
),
"aya_vision-multi_image"
:
VLMTestInfo
(
models
=
[
"CohereForAI/aya-vision-8b"
],
test_type
=
(
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>
{
img_prompt
}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
,
# noqa: E501
single_image_prompts
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>What's the content in the center of the image?"
,
# noqa: E501
"cherry_blossom"
:
"<image>What is the season?"
,
# noqa: E501
}),
multi_image_prompt
=
"<image><image>Describe the two images in detail."
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"crop_to_patches"
:
True
}}
vllm_runner_kwargs
=
{
"mm_processor_kwargs"
:
{
"crop_to_patches"
:
True
}},
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
"blip2"
:
VLMTestInfo
(
# TODO: Change back to 2.7b once head_dim = 80 is supported
...
...
@@ -306,6 +321,21 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
),
"llama4"
:
VLMTestInfo
(
models
=
[
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
],
prompt_formatter
=
lambda
img_prompt
:
f
"<|begin_of_text|><|header_start|>user<|header_end|>
\n\n
{
img_prompt
}
<|eot|><|header_start|>assistant<|header_end|>
\n\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
_
:
"<|image|>"
,
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
distributed_executor_backend
=
"mp"
,
image_size_factors
=
[(.
25
,
0.5
,
1.0
)],
hf_model_kwargs
=
{
"device_map"
:
"auto"
},
max_model_len
=
8192
,
max_num_seqs
=
4
,
dtype
=
"bfloat16"
,
auto_cls
=
AutoModelForImageTextToText
,
tensor_parallel_size
=
4
,
marks
=
multi_gpu_marks
(
num_gpus
=
4
),
),
"llava_next"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
CUSTOM_INPUTS
),
...
...
@@ -398,23 +428,20 @@ VLM_TEST_SETTINGS = {
max_num_seqs
=
2
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
),
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
# https://github.com/huggingface/transformers/issues/34307
# "phi3v": VLMTestInfo(
# models=["microsoft/Phi-3.5-vision-instruct"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
# img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
# max_model_len=4096,
# max_num_seqs=2,
# task="generate",
# # use eager mode for hf runner since phi3v didn't work with flash_attn
# hf_model_kwargs={"_attn_implementation": "eager"},
# use_tokenizer_eos=True,
# vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
# num_logprobs=10,
# ),
"phi3v"
:
VLMTestInfo
(
models
=
[
"microsoft/Phi-3.5-vision-instruct"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|user|>
\n
{
img_prompt
}
<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
f
"<|image_
{
idx
}
|>
\n
"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
task
=
"generate"
,
# use sdpa mode for hf runner since phi3v didn't work with flash_attn
hf_model_kwargs
=
{
"_attn_implementation"
:
"sdpa"
},
use_tokenizer_eos
=
True
,
vllm_output_post_proc
=
model_utils
.
phi3v_vllm_to_hf_output
,
num_logprobs
=
10
,
),
"pixtral_hf"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/pixtral-12b-FP8-dynamic"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
...
...
@@ -466,6 +493,16 @@ VLM_TEST_SETTINGS = {
patch_hf_runner
=
model_utils
.
skyworkr1v_patch_hf_runner
,
marks
=
[
large_gpu_mark
(
min_gb
=
80
)],
),
"smolvlm"
:
VLMTestInfo
(
models
=
[
"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User:
{
img_prompt
}
<end_of_utterance>
\n
Assistant:"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<image>"
,
max_model_len
=
8192
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
hf_output_post_proc
=
model_utils
.
smolvlm_trunc_hf_output
,
),
### Tensor parallel / multi-gpu broadcast tests
"chameleon-broadcast"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
)],
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment