Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a99300bd
Commit
a99300bd
authored
Sep 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev
parents
cc3e01c7
5438967f
Changes
512
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
946 additions
and
223 deletions
+946
-223
tests/kernels/moe/untest_pplx_cutlass_moe.py
tests/kernels/moe/untest_pplx_cutlass_moe.py
+23
-1
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
+2
-2
tests/kernels/quantization/test_awq_triton.py
tests/kernels/quantization/test_awq_triton.py
+1
-1
tests/kernels/quantization/test_cutlass_w4a8.py
tests/kernels/quantization/test_cutlass_w4a8.py
+259
-0
tests/kernels/quantization/test_flashinfer_scaled_mm.py
tests/kernels/quantization/test_flashinfer_scaled_mm.py
+73
-0
tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+126
-0
tests/kernels/quantization/test_triton_scaled_mm.py
tests/kernels/quantization/test_triton_scaled_mm.py
+1
-1
tests/kernels/quantization/untest_cutlass_2of4_sparse.py
tests/kernels/quantization/untest_cutlass_2of4_sparse.py
+1
-1
tests/kernels/quantization/untest_cutlass_scaled_mm.py
tests/kernels/quantization/untest_cutlass_scaled_mm.py
+2
-2
tests/kernels/quantization/untest_machete_mm.py
tests/kernels/quantization/untest_machete_mm.py
+18
-18
tests/kernels/quantization/untest_marlin_gemm.py
tests/kernels/quantization/untest_marlin_gemm.py
+1
-84
tests/kernels/quantization/untest_triton_scaled_mm.py
tests/kernels/quantization/untest_triton_scaled_mm.py
+124
-0
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+88
-23
tests/kernels/test_onednn.py
tests/kernels/test_onednn.py
+144
-0
tests/lora/conftest.py
tests/lora/conftest.py
+4
-38
tests/lora/test_add_lora.py
tests/lora/test_add_lora.py
+4
-7
tests/lora/test_chatglm3_tp.py
tests/lora/test_chatglm3_tp.py
+5
-1
tests/lora/test_layers.py
tests/lora/test_layers.py
+33
-39
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+1
-4
tests/lora/test_llm_with_multi_loras.py
tests/lora/test_llm_with_multi_loras.py
+36
-1
No files found.
Too many changes to show.
To preserve performance only
512 of 512+
files are displayed.
Plain diff
Email patch
tests/kernels/moe/untest_pplx_cutlass_moe.py
View file @
a99300bd
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
cdiv
from
vllm.utils
import
cdiv
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
try
:
try
:
...
@@ -76,6 +77,7 @@ def pplx_cutlass_moe(
...
@@ -76,6 +77,7 @@ def pplx_cutlass_moe(
assert
torch
.
cuda
.
current_device
()
==
pgi
.
local_rank
assert
torch
.
cuda
.
current_device
()
==
pgi
.
local_rank
num_tokens
,
hidden_dim
=
a
.
shape
num_tokens
,
hidden_dim
=
a
.
shape
intermediate_dim
=
w2
.
shape
[
2
]
num_experts
=
w1
.
shape
[
0
]
num_experts
=
w1
.
shape
[
0
]
block_size
=
hidden_dim
# TODO support more cases
block_size
=
hidden_dim
# TODO support more cases
device
=
pgi
.
device
device
=
pgi
.
device
...
@@ -124,8 +126,27 @@ def pplx_cutlass_moe(
...
@@ -124,8 +126,27 @@ def pplx_cutlass_moe(
num_local_experts
=
num_local_experts
,
num_local_experts
=
num_local_experts
,
num_dispatchers
=
num_dispatchers
)
num_dispatchers
=
num_dispatchers
)
ab_strides1
=
torch
.
full
((
num_local_experts
,
),
hidden_dim
,
device
=
"cuda"
,
dtype
=
torch
.
int64
)
ab_strides2
=
torch
.
full
((
num_local_experts
,
),
intermediate_dim
,
device
=
"cuda"
,
dtype
=
torch
.
int64
)
c_strides1
=
torch
.
full
((
num_local_experts
,
),
2
*
intermediate_dim
,
device
=
"cuda"
,
dtype
=
torch
.
int64
)
c_strides2
=
torch
.
full
((
num_local_experts
,
),
hidden_dim
,
device
=
"cuda"
,
dtype
=
torch
.
int64
)
experts
=
CutlassBatchedExpertsFp8
(
num_local_experts
,
num_dispatchers
,
experts
=
CutlassBatchedExpertsFp8
(
num_local_experts
,
num_dispatchers
,
out_dtype
,
per_act_token
,
per_out_ch
)
out_dtype
,
per_act_token
,
per_out_ch
,
ab_strides1
,
ab_strides2
,
c_strides1
,
c_strides2
)
fused_cutlass_experts
=
FusedMoEModularKernel
(
fused_cutlass_experts
=
FusedMoEModularKernel
(
prepare_finalize
,
prepare_finalize
,
...
@@ -227,6 +248,7 @@ def _pplx_moe(
...
@@ -227,6 +248,7 @@ def _pplx_moe(
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[[
2
,
1
]])
#, [4, 2]])
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[[
2
,
1
]])
#, [4, 2]])
@
pytest
.
mark
.
parametrize
(
"use_internode"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"use_internode"
,
[
False
])
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
skipif
(
@
pytest
.
mark
.
skipif
(
(
lambda
x
:
x
is
None
or
not
ops
.
cutlass_group_gemm_supported
(
x
.
to_int
()))(
(
lambda
x
:
x
is
None
or
not
ops
.
cutlass_group_gemm_supported
(
x
.
to_int
()))(
current_platform
.
get_device_capability
()),
current_platform
.
get_device_capability
()),
...
...
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
View file @
a99300bd
...
@@ -24,7 +24,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
...
@@ -24,7 +24,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
current_platform
.
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
# Input tensor of shape (E, T, 2*H)
# Input tensor of shape (E, T, 2*H)
y
=
torch
.
randn
((
E
,
T
,
2
*
H
),
dtype
=
torch
.
float
32
,
device
=
"cuda"
)
y
=
torch
.
randn
((
E
,
T
,
2
*
H
),
dtype
=
torch
.
b
float
16
,
device
=
"cuda"
)
tokens_per_expert
=
torch
.
randint
(
tokens_per_expert
=
torch
.
randint
(
low
=
0
,
low
=
0
,
high
=
T
,
high
=
T
,
...
@@ -74,7 +74,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
...
@@ -74,7 +74,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
y_se
=
y_s
[
e
]
y_se
=
y_s
[
e
]
y_qe
=
y_q
[
e
]
y_qe
=
y_q
[
e
]
torch
.
testing
.
assert_close
(
y_se
[:
nt
],
ref_s
[:
nt
])
torch
.
testing
.
assert_close
(
y_se
[:
nt
],
ref_s
[:
nt
]
,
atol
=
1e-4
,
rtol
=
1e-2
)
torch
.
testing
.
assert_close
(
torch
.
testing
.
assert_close
(
y_qe
[:
nt
].
to
(
torch
.
float32
),
y_qe
[:
nt
].
to
(
torch
.
float32
),
ref_q
[:
nt
].
to
(
torch
.
float32
),
ref_q
[:
nt
].
to
(
torch
.
float32
),
...
...
tests/kernels/quantization/test_awq_triton.py
View file @
a99300bd
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the AWQ Triton kernel.
"""Tests for the AWQ Triton kernel.
Run `pytest tests/kernels/test_awq_triton.py`.
Run `pytest tests/kernels/
quantization/
test_awq_triton.py`.
"""
"""
import
pytest
import
pytest
import
torch
import
torch
...
...
tests/kernels/quantization/test_cutlass_w4a8.py
0 → 100644
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the CUTLASS W4A8 kernel.
Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`.
"""
from
dataclasses
import
dataclass
from
typing
import
Optional
import
pytest
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
pack_rows
,
quantize_weights
)
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
ScalarType
,
scalar_types
# NOTE: IS_SUPPORTED_BY_GPU is intentionally not defined here. It is defined
# once, after TEST_TYPES, via current_platform.has_device_capability(90).
# The duplicate that used to sit here indexed the result of
# current_platform.get_device_capability() directly, which breaks module
# import whenever that call returns None (other tests in this repository
# explicitly guard against a None capability).

# (M, N, K) problem sizes used for the "shape" parametrization.
MNK_SHAPES = [
    (1, 128, 128),
    (1, 512, 1024),
    (1, 4096, 4096),
    (1, 8192, 28672),
    (13, 8192, 4096),
    (26, 4096, 8192),
    (64, 4096, 4096),
    (64, 8192, 28672),
    (257, 128, 4096),
    (257, 4096, 4096),
    (1024, 4096, 8192),
    (1024, 8192, 4096),
]

# Schedule names used for the "schedule" parametrization.
# TODO(czhu): get supported schedules from fn
SCHEDULES = [
    '128x16_1x1x1',
    '256x16_1x1x1',
    '128x32_1x1x1',
    '256x32_1x1x1',
    '128x64_1x1x1',
    '256x64_1x1x1',
    '128x128_1x1x1',
    '256x128_1x1x1',
    '128x256_1x1x1',
    '128x256_2x1x1',
]
@dataclass
class TypeConfig:
    """One combination of dtypes exercised by the W4A8 tests."""

    # dtype the reference operands are cast to before the reference matmul
    act_type: torch.dtype
    # quantized weight type handed to quantize_weights
    weight_type: ScalarType
    # passed as out_dtype to the reference matmul
    output_type: Optional[torch.dtype]
    # when not None, the random weight is cast to this dtype before quantizing
    group_scale_type: Optional[torch.dtype]
    # dtype of the per-channel scale vector
    channel_scale_type: Optional[torch.dtype]
    # dtype of the per-token scale vector
    token_scale_type: Optional[torch.dtype]
@dataclass
class Tensors:
    """Bundle of the tensors produced by create_test_tensors."""

    # float32 reference weight (dequantized)
    w_ref: torch.Tensor
    # float32 copy of the activation
    a_ref: torch.Tensor
    # activation as passed to the kernel under test
    a: torch.Tensor
    # packed / reordered quantized weight
    w_q: torch.Tensor
    # packed group scales
    w_g_s: torch.Tensor
    # per-channel scales
    w_ch_s: torch.Tensor
    # per-token scales
    w_tok_s: torch.Tensor
# (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
#  Ch Scales Type, Tok Scales Type)
# NOTE(review): the alias below has five elements while the comment above
# lists seven, and nothing in the visible part of this file refers to it.
TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype],
                      Optional[torch.dtype], bool]

# Every TypeConfig the tests are parametrized over.
TEST_TYPES = [
    TypeConfig(act_type=torch.float8_e4m3fn,
               weight_type=w_type,
               output_type=o_type,
               group_scale_type=torch.float8_e4m3fn,
               channel_scale_type=torch.float32,
               token_scale_type=torch.float32)
    for w_type in [scalar_types.int4]
    # TODO(czhu): fp16 out type
    for o_type in [torch.bfloat16]
]

# TODO: in a future PR, refactor this and `is_quant_method_supported` in the
#  kernel unit tests into a common utility function. Currently the use of
#  `is_quant_method_supported` conflates kernels with quantization methods,
#  an assumption which is breaking down as quantization methods can have
#  multiple kernels and some kernels support multiple quantization methods.
IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
# For testing quantized linear kernels
def
to_fp8
(
tensor
:
torch
.
Tensor
):
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
return
tensor
.
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
).
to
(
dtype
=
torch
.
float8_e4m3fn
)
def cutlass_quantize_and_pack(atype: torch.dtype,
                              w: torch.Tensor,
                              wtype: ScalarType,
                              stype: Optional[torch.dtype],
                              group_size: Optional[int],
                              zero_points: bool = False):
    """Quantize ``w`` and pack it for the CUTLASS W4A8 kernel.

    Returns ``(w_ref, w_q_packed, w_s_packed, w_zp)``: a reference weight
    in ``atype`` rebuilt from the quantized values and the scales cast to
    ``atype``, the encoded/reordered quantized weight, the packed scales,
    and the zero points reported by ``quantize_weights``.

    ``stype`` is accepted but not read by this function.
    """
    assert wtype.is_integer(), "TODO: support floating point weights"

    # The reference weight returned by quantize_weights is discarded; it is
    # recomputed below from the scales after they were cast to ``atype``.
    _, w_q, w_s, w_zp = quantize_weights(w,
                                         wtype,
                                         group_size=group_size,
                                         zero_points=zero_points)

    # since scales are cast to fp8, we need to compute w_ref this way
    scales_per_row = w_s.to(atype).to(torch.float32).repeat_interleave(
        group_size, dim=0)
    w_ref = (w_q.to(torch.float32) * scales_per_row).to(atype)

    # bit mask prevents sign extending int4 when packing
    packed_rows = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape)
    # convert to col major
    col_major = packed_rows.t().contiguous().t()

    w_q_packed = ops.cutlass_encode_and_reorder_int4b(col_major)
    w_s_packed = ops.cutlass_pack_scale_fp8(w_s.to(atype))

    return w_ref, w_q_packed, w_s_packed, w_zp
def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig,
                        group_size: Optional[int]) -> Tensors:
    """Build random CUDA inputs and references for one (M, N, K) problem.

    The activation is (M, K) and the weight (K, N), both drawn from
    ``torch.randn`` and pushed through ``to_fp8``. The weight is then
    quantized and packed with ``cutlass_quantize_and_pack``.
    """
    m, n, k = shape

    print("create_test_tensors, shape:", shape, "types:", types,
          "group_size:", group_size)

    a = to_fp8(torch.randn((m, k), device="cuda"))
    w = to_fp8(torch.randn((k, n), device="cuda"))

    scale_dtype = types.group_scale_type
    if scale_dtype is not None:
        w = w.to(scale_dtype)
    # One-byte dtypes are widened to fp16 before quantization.
    if w.dtype.itemsize == 1:
        w = w.to(torch.float16)

    w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
        a.dtype, w, types.weight_type, scale_dtype, group_size, False)

    # for the practical use case we need per-tok scales for fp8 activations
    w_tok_s = torch.randn((m, ), device='cuda', dtype=types.token_scale_type)
    # weights are already per-group quantized, use placeholder here
    w_ch_s = torch.ones((n, ), device='cuda', dtype=types.channel_scale_type)

    return Tensors(w_ref=w_ref.to(torch.float32),
                   a_ref=a.to(torch.float32),
                   a=a,
                   w_q=w_q_packed,
                   w_g_s=w_s,
                   w_ch_s=w_ch_s,
                   w_tok_s=w_tok_s)
def mm_test_helper(types: TypeConfig,
                   tensors: Tensors,
                   group_size: Optional[int] = None,
                   schedule: Optional[str] = None):
    """Run the W4A8 kernel and compare it with a ``torch._scaled_mm`` reference.

    NOTE(review): ``schedule`` is accepted but never forwarded to the kernel
    call below, so every schedule parametrization runs the same code path —
    confirm whether that is intended.
    """
    # CUTLASS upstream uses fp8 with fastaccum as reference
    # https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406
    ref_act = tensors.a_ref.to(types.act_type)
    # col major
    ref_weight = tensors.w_ref.to(types.act_type).t().contiguous().t()
    output_ref = torch._scaled_mm(ref_act,
                                  ref_weight,
                                  tensors.w_tok_s.unsqueeze(1),
                                  tensors.w_ch_s.unsqueeze(0),
                                  out_dtype=types.output_type,
                                  use_fast_accum=True)

    output = ops.cutlass_w4a8_mm(
        a=tensors.a,
        b_q=tensors.w_q,
        b_group_scales=tensors.w_g_s,
        b_group_size=group_size,
        b_channel_scales=tensors.w_ch_s,
        a_token_scales=tensors.w_tok_s,
    )

    print(output)
    print(output_ref)

    expected = output_ref.to(output.dtype)
    torch.testing.assert_close(output, expected, rtol=1e-3, atol=1e-3)
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
                    reason="CUTLASS W4A8 is not supported on this GPU type.")
@pytest.mark.parametrize("shape",
                         MNK_SHAPES,
                         ids=lambda x: "x".join(map(str, x)))
@pytest.mark.parametrize("types", TEST_TYPES)
@pytest.mark.parametrize("schedule", SCHEDULES)
def test_cutlass_w4a8(shape, types: TypeConfig, schedule):
    """Check the W4A8 kernel against the reference for each parametrization."""
    # Only group size 128 is exercised.
    for group_size in (128, ):
        inputs = create_test_tensors(shape, types, group_size)
        mm_test_helper(types, inputs, group_size, schedule)
# Test to make sure cuda graphs work
class W4A8Layer(torch.nn.Module):
    """Minimal module whose forward pass is a single W4A8 kernel call."""

    def __init__(self, **kwargs):
        super().__init__()
        # Keyword arguments replayed on every forward() call.
        self.kwargs = kwargs

    def forward(self, a):
        """Call ``ops.cutlass_w4a8_mm`` with ``a`` and the stored kwargs."""
        kernel_kwargs = self.kwargs
        return ops.cutlass_w4a8_mm(a=a, **kernel_kwargs)
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
                    reason="CUTLASS W4A8 is not supported on this GPU type.")
def test_w4a8_cuda_graph():
    """Capture the W4A8 kernel in a CUDA graph and verify the replayed output."""
    m, n, k = 512, 4096, 4096
    group_size = 128

    a = to_fp8(torch.randn((m, k), device="cuda"))
    b = to_fp8(torch.randn((k, n), device="cuda"))

    w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
        a.dtype,
        b.to(torch.float16),
        scalar_types.int4,  # wtype
        torch.float8_e4m3fn,  # stype
        group_size,
        False,  # zero_points
    )

    w_tok_s = torch.randn((m, ), device='cuda', dtype=torch.float32)
    w_ch_s = torch.ones((n, ), device='cuda', dtype=torch.float32)

    # Construct a trivial model with a single layer that calls the kernel
    model = W4A8Layer(
        b_q=w_q_packed,
        b_group_scales=w_s,
        b_group_size=group_size,
        b_channel_scales=w_ch_s,
        a_token_scales=w_tok_s,
    )

    # col major
    ref_weight = w_ref.to(a.dtype).t().contiguous().t()
    output_ref = torch._scaled_mm(a,
                                  ref_weight,
                                  w_tok_s.unsqueeze(1),
                                  w_ch_s.unsqueeze(0),
                                  out_dtype=torch.bfloat16,
                                  use_fast_accum=True)

    # Run the model with a cuda graph
    capture_stream = torch.cuda.Stream()
    with torch.cuda.stream(capture_stream):
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            output = model(a)
    # Clear the captured output so the comparison below can only pass if
    # replay() fills it in again.
    output.zero_()
    graph.replay()

    torch.testing.assert_close(output, output_ref, rtol=1e-3, atol=1e-3)
tests/kernels/quantization/test_flashinfer_scaled_mm.py
0 → 100644
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
flashinfer_scaled_fp8_mm
# Skip the whole module when the device lacks the required capability.
_HAS_REQUIRED_CAPABILITY = current_platform.has_device_capability(100)
if not _HAS_REQUIRED_CAPABILITY:
    pytest.skip(
        reason=
        "Flashinfer FP8 gemms requires compute capability of 10.0 or above.",
        allow_module_level=True,
    )
# Output dtypes the test is parametrized over.
DTYPES = [torch.float16, torch.bfloat16]

# m, n, k
SHAPES = [
    (128, 128, 64),
    (128, 128, 128),
    (256, 128, 64),
    (128, 256, 128),
]
# Additional (m, n, k) cases appended to SHAPES.
PAD_SHAPES = [
    (150, 128, 64),
    (128, 128, 96),
]
SHAPES += PAD_SHAPES

SEEDS = [42]
CUDA_DEVICES = ["cuda:0"]
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("autotune", [False, True])
@torch.inference_mode()
def test_flashinfer_fp8_gemm(
    dtype: torch.dtype,
    shape: tuple[int, int, int],
    use_bias: bool,
    seed: int,
    device: str,
    autotune: bool,
) -> None:
    """Compare ``flashinfer_scaled_fp8_mm`` with a float32 ``torch.mm`` reference."""
    current_platform.seed_everything(seed)
    m, n, k = shape

    a = torch.randn((m, k), dtype=dtype, device=device)
    b = torch.randn((n, k), dtype=dtype, device=device) / k

    a_fp8, a_scale = ops.scaled_fp8_quant(a)
    b_fp8, b_scale = ops.scaled_fp8_quant(b)

    # Reference: rescale the quantized operands in float32, multiply, then
    # cast to the requested dtype.
    lhs = a_scale * a_fp8.to(dtype=torch.float32)
    rhs = b_scale * b_fp8.to(dtype=torch.float32).t()
    expected_out = torch.mm(lhs, rhs).to(dtype=dtype)

    bias = None
    if use_bias:
        bias = torch.randn((n, ), dtype=dtype, device=device)
        expected_out = expected_out + bias

    # Imported here rather than at module top, as in the original.
    import flashinfer

    with flashinfer.autotune(autotune):
        out = flashinfer_scaled_fp8_mm(
            a_fp8,
            b_fp8.t(),
            a_scale,
            b_scale,
            dtype,
            bias=bias,
        )

    torch.testing.assert_close(out, expected_out, atol=1e-2, rtol=1e-2)
tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
0 → 100644
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
tests.kernels.utils
import
opcheck
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
# Skip the whole module when the device lacks the required capability.
_HAS_REQUIRED_CAPABILITY = current_platform.has_device_capability(100)
if not _HAS_REQUIRED_CAPABILITY:
    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
                allow_module_level=True)

# Parametrization values.
DTYPES = [torch.float16, torch.bfloat16]
SHAPES = [
    (128, 64),
    (128, 128),
    (256, 64),
    (256, 128),
]
SEEDS = [42]
CUDA_DEVICES = ['cuda:0']

# Largest representable values of the two formats; used to derive the
# global scale in the test.
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max

BLOCK_SIZE = 16
def ref_impl(
        silu_and_mul: SiluAndMul, x: torch.Tensor,
        global_scale: torch.Tensor, ref_output_scale: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    """Unfused reference: native SiLU-and-mul, then ``scaled_fp4_quant``.

    Args:
        silu_and_mul: layer whose ``forward_native`` produces the activation.
        x: input tensor passed to ``forward_native``.
        global_scale: global scale forwarded to the quantization op.
        ref_output_scale: buffer the quantization op writes block scales
            into. It is used in place, not copied.

    Returns:
        ``(out, output_scale)``: ``out`` is a uint8 tensor of shape
        ``(m, n // 2)``, and ``output_scale`` is the very same tensor
        object as ``ref_output_scale``.
    """
    silu_and_mul_out = silu_and_mul.forward_native(x)

    assert not current_platform.is_rocm()
    assert silu_and_mul_out.ndim >= 1, (
        f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.')

    # Flatten all leading dimensions into one; keep the last dimension.
    other_dims = 1 if silu_and_mul_out.ndim == 1 else -1
    silu_and_mul_out = silu_and_mul_out.reshape(other_dims,
                                                silu_and_mul_out.shape[-1])
    m, n = silu_and_mul_out.shape
    device = silu_and_mul_out.device

    # Two fp4 values will be packed into an uint8.
    out = torch.empty((m, n // 2), device=device, dtype=torch.uint8)

    output_scale = ref_output_scale

    torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale,
                                  global_scale)

    return out, output_scale
def ops_impl(
        x: torch.Tensor, global_scale: torch.Tensor,
        ref_output_scale: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    """Fused path: one call to ``silu_and_mul_nvfp4_quant``.

    Args:
        x: 2-D input; the output has ``x.shape[1] // 4`` uint8 columns.
        global_scale: global scale forwarded to the fused op.
        ref_output_scale: buffer the fused op writes block scales into.
            It is used in place, not copied.

    Returns:
        ``(out, output_scale)``: ``out`` is a uint8 tensor of shape
        ``(x.shape[0], x.shape[1] // 4)``, and ``output_scale`` is the very
        same tensor object as ``ref_output_scale``.
    """
    out_shape = (x.shape[0], x.shape[1] // 4)
    output_scale = ref_output_scale
    out = torch.empty(out_shape, dtype=torch.uint8, device=x.device)
    torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale)
    return out, output_scale
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_quantize_to_fp4(
    dtype: torch.dtype,
    shape: tuple[int, int],
    seed: int,
    device: str,
) -> None:
    """Compare the fused SiLU-mul-NVFP4-quant op with the unfused reference."""

    def _round_up(value, multiple):
        # Smallest multiple of ``multiple`` that is >= ``value``.
        return (value + multiple - 1) // multiple * multiple

    current_platform.seed_everything(seed)
    torch.set_default_device(device)

    m, n = shape

    x = torch.randn((m, n), dtype=dtype)
    tensor_amax = torch.abs(x).max().to(torch.float32)
    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax

    block_size = 16

    assert n % block_size == 0, (
        f'last dim has to be multiple of 16, but got {n}.')
    assert x.dtype in (torch.float16, torch.bfloat16), (
        f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.')

    # Scale buffer: rows rounded up to a multiple of 128, per-row scale count
    # rounded up to a multiple of 4 and stored four per int32.
    rounded_m = _round_up(x.shape[0], 128)
    scale_n = x.shape[1] // (2 * block_size)
    rounded_n = _round_up(scale_n, 4)
    output_scale = torch.empty((rounded_m, rounded_n // 4),
                               device=x.device,
                               dtype=torch.int32)

    layer = SiluAndMul()

    # NOTE(review): both implementations are handed the same ``output_scale``
    # buffer and both return that buffer, so ``ref_out_scale`` and
    # ``fusion_out_scale`` are one tensor object and the final scale
    # comparison cannot fail. Consider separate buffers — needs checking on
    # hardware.
    ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale)

    fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale)

    assert ref_out.dtype == torch.uint8
    assert fusion_out.dtype == torch.uint8
    assert ref_out.shape == fusion_out.shape

    assert ref_out_scale.dtype == torch.int32
    assert fusion_out_scale.dtype == torch.int32
    assert ref_out_scale.shape == fusion_out_scale.shape

    # Allow up to 2% of mismatched values since BF16 has accuracy issues.
    mis_threshold = 0.02
    atol = 0.4
    rtol = 0.4
    # NOTE(review): only the last row is compared, and the subtraction below
    # operates on uint8 tensors — confirm both are intended.
    ref_logits = ref_out[-1]
    fusion_logits = fusion_out[-1]

    abs_diff = torch.abs(fusion_logits - ref_logits)
    tolerance = atol + rtol * torch.abs(ref_logits)
    mis_count = torch.sum(abs_diff > tolerance)
    mis_ratio = mis_count / fusion_logits.numel()

    assert mis_ratio < mis_threshold, \
        f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}"

    torch.testing.assert_close(ref_out_scale, fusion_out_scale)

    opcheck(torch.ops._C.silu_and_mul_nvfp4_quant,
            (fusion_out, fusion_out_scale, x, global_scale))
tests/kernels/quantization/test_triton_scaled_mm.py
View file @
a99300bd
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_scaled_mm kernel
"""Tests for the triton_scaled_mm kernel
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
Run `pytest tests/kernels/
quantization/
test_triton_scaled_mm.py`.
"""
"""
import
os
import
os
import
importlib
import
importlib
...
...
tests/kernels/quantization/untest_cutlass_2of4_sparse.py
View file @
a99300bd
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for sparse cutlass kernels
"""Tests for sparse cutlass kernels
Run `pytest tests/kernels/
test_semi_structured
.py`.
Run `pytest tests/kernels/
quantization/test_cutlass_2of4_sparse
.py`.
"""
"""
import
pytest
import
pytest
...
...
tests/kernels/quantization/untest_cutlass_scaled_mm.py
View file @
a99300bd
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for cutlass kernels
"""Tests for cutlass kernels
Run `pytest tests/kernels/test_cutlass.py`.
Run `pytest tests/kernels/
quantization/
test_cutlass
_scaled_mm
.py`.
"""
"""
import
random
import
random
...
@@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
...
@@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
expert_offsets
=
torch
.
zeros
((
num_experts
+
1
),
expert_offsets
=
torch
.
zeros
((
num_experts
+
1
),
device
=
device
,
device
=
device
,
dtype
=
torch
.
int
32
)
dtype
=
torch
.
int
64
)
problem_sizes
=
torch
.
zeros
((
num_experts
,
3
),
problem_sizes
=
torch
.
zeros
((
num_experts
,
3
),
device
=
device
,
device
=
device
,
...
...
tests/kernels/quantization/untest_machete_mm.py
View file @
a99300bd
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the machete kernel.
"""Tests for the machete kernel.
Run `pytest tests/kernels/test_machete_mm.py`.
Run `pytest tests/kernels/
quantization/
test_machete_mm.py`.
"""
"""
import
math
import
math
...
@@ -95,23 +95,23 @@ TEST_TYPES = [
...
@@ -95,23 +95,23 @@ TEST_TYPES = [
token_scale_type
=
None
)
token_scale_type
=
None
)
for
w_type
in
[
scalar_types
.
uint4
,
scalar_types
.
uint8
]
for
w_type
in
[
scalar_types
.
uint4
,
scalar_types
.
uint8
]
for
a_type
in
[
torch
.
float16
,
torch
.
bfloat16
]),
for
a_type
in
[
torch
.
float16
,
torch
.
bfloat16
]),
# QQQ style
#
#
QQQ style
*
(
TypeConfig
(
act_type
=
torch
.
int8
,
#
*(TypeConfig(act_type=torch.int8,
weight_type
=
scalar_types
.
uint4b8
,
#
weight_type=scalar_types.uint4b8,
output_type
=
torch
.
float16
,
#
output_type=torch.float16,
group_scale_type
=
group_scale_type
,
#
group_scale_type=group_scale_type,
group_zero_type
=
None
,
#
group_zero_type=None,
channel_scale_type
=
torch
.
float
,
#
channel_scale_type=torch.float,
token_scale_type
=
torch
.
float
)
#
token_scale_type=torch.float)
for
group_scale_type
in
[
None
,
torch
.
float16
]),
#
for group_scale_type in [None, torch.float16]),
*
(
TypeConfig
(
act_type
=
torch
.
float8_e4m3fn
,
#
*(TypeConfig(act_type=torch.float8_e4m3fn,
weight_type
=
scalar_types
.
uint4b8
,
#
weight_type=scalar_types.uint4b8,
output_type
=
torch
.
float16
,
#
output_type=torch.float16,
group_scale_type
=
group_scale_type
,
#
group_scale_type=group_scale_type,
group_zero_type
=
None
,
#
group_zero_type=None,
channel_scale_type
=
torch
.
float
,
#
channel_scale_type=torch.float,
token_scale_type
=
torch
.
float
)
#
token_scale_type=torch.float)
for
group_scale_type
in
[
None
,
torch
.
float16
]),
#
for group_scale_type in [None, torch.float16]),
]
]
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
...
...
tests/kernels/quantization/untest_marlin_gemm.py
View file @
a99300bd
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the marlin kernel.
"""Tests for the marlin kernel.
Run `pytest tests/kernels/
marli
n/test_marlin_gemm.py`.
Run `pytest tests/kernels/
quantizatio
n/test_marlin_gemm.py`.
"""
"""
import
pytest
import
pytest
import
torch
import
torch
...
@@ -13,11 +13,7 @@ from vllm import _custom_ops as ops
...
@@ -13,11 +13,7 @@ from vllm import _custom_ops as ops
from
vllm.model_executor.layers.quantization.gptq_marlin_24
import
(
from
vllm.model_executor.layers.quantization.gptq_marlin_24
import
(
GPTQ_MARLIN_24_MAX_PARALLEL
,
GPTQ_MARLIN_24_MIN_THREAD_N
,
GPTQ_MARLIN_24_MAX_PARALLEL
,
GPTQ_MARLIN_24_MIN_THREAD_N
,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
,
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
)
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
,
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
)
from
vllm.model_executor.layers.quantization.qqq
import
(
MARLIN_QQQ_MAX_PARALLEL
,
MARLIN_QQQ_MIN_THREAD_N
,
MARLIN_QQQ_SUPPORTED_GROUP_SIZES
,
MARLIN_QQQ_SUPPORTED_NUM_BITS
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils
import
(
from
vllm.model_executor.layers.quantization.utils.marlin_utils
import
(
GPTQ_MARLIN_MAX_PARALLEL
,
GPTQ_MARLIN_MIN_THREAD_N
,
MARLIN_SUPPORTED_GROUP_SIZES
,
marlin_make_empty_g_idx
,
MARLIN_SUPPORTED_GROUP_SIZES
,
marlin_make_empty_g_idx
,
marlin_make_workspace_new
,
marlin_permute_bias
,
marlin_permute_scales
,
marlin_make_workspace_new
,
marlin_permute_bias
,
marlin_permute_scales
,
query_marlin_supported_quant_types
)
query_marlin_supported_quant_types
)
...
@@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
...
@@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
marlin_weights
)
marlin_weights
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_24
import
(
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_24
import
(
marlin_24_quantize
)
marlin_24_quantize
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq
import
(
# noqa: E501
marlin_qqq_quantize
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
awq_pack
,
gptq_pack
,
gptq_quantize_weights
,
quantize_weights
,
sort_weights
)
awq_pack
,
gptq_pack
,
gptq_quantize_weights
,
quantize_weights
,
sort_weights
)
from
vllm.scalar_type
import
scalar_types
from
vllm.scalar_type
import
scalar_types
...
@@ -449,68 +443,6 @@ def test_hqq_marlin_gemm(
...
@@ -449,68 +443,6 @@ def test_hqq_marlin_gemm(
assert
max_diff
<
0.04
assert
max_diff
<
0.04
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"qqq"
),
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_QQQ_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_QQQ_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_marlin_qqq_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
mnk_factors
,
):
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
size_k
=
k_chunk
*
k_factor
size_n
=
n_chunk
*
n_factor
a_input
=
rand_data
((
size_m
,
size_k
))
b_weight
=
rand_data
((
size_k
,
size_n
))
# Quantize activations
s_a
=
a_input
.
abs
().
max
(
dim
=-
1
,
keepdim
=
True
)[
0
].
div
(
int8_traits
.
max
).
to
(
torch
.
float
)
q_a
=
(
a_input
/
s_a
).
round
().
clamp
(
int8_traits
.
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
# Quantize weights
w_ref
,
marlin_qqq_q_w
,
marlin_qqq_s_group
,
marlin_qqq_s_channel
=
\
marlin_qqq_quantize
(
b_weight
,
num_bits
,
group_size
)
workspace
=
MarlinWorkspace
(
size_n
,
MARLIN_QQQ_MIN_THREAD_N
,
MARLIN_QQQ_MAX_PARALLEL
)
opcheck
(
torch
.
ops
.
_C
.
marlin_qqq_gemm
,
(
q_a
,
marlin_qqq_q_w
,
s_a
,
marlin_qqq_s_channel
,
marlin_qqq_s_group
,
workspace
.
scratch
,
a_input
.
shape
[
0
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
]))
output
=
ops
.
marlin_qqq_gemm
(
q_a
,
marlin_qqq_q_w
,
s_a
,
marlin_qqq_s_channel
,
marlin_qqq_s_group
,
workspace
.
scratch
,
a_input
.
shape
[
0
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
],
)
output_ref
=
torch
.
matmul
(
q_a
.
half
()
*
s_a
.
half
(),
w_ref
)
torch
.
cuda
.
synchronize
()
max_diff
=
compute_max_diff
(
output
,
output_ref
)
assert
max_diff
<
0.04
def
test_marlin_gemm_subset_input
():
def
test_marlin_gemm_subset_input
():
quant_type
=
scalar_types
.
uint4b8
quant_type
=
scalar_types
.
uint4b8
group_size
=
128
group_size
=
128
...
@@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m):
...
@@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m):
max_diff
=
compute_max_diff
(
output
,
output_ref
)
max_diff
=
compute_max_diff
(
output
,
output_ref
)
assert
max_diff
<
0.04
assert
max_diff
<
0.04
def test_marlin_gemm_opcheck():
    """Check that ``marlin_gemm`` is repeatable and passes ``opcheck``.

    The custom op is invoked twice on identical inputs and the two results
    are compared with ``torch.testing.assert_close``; the op is then handed
    to ``opcheck`` together with the same argument tuple.
    """
    size_m = 2048
    size_n = 4096
    size_k = 4096

    # Activations are (size_m, size_k): the op receives the sizes in
    # (m, n, k) order and the activation's second dimension is k.
    a = torch.rand((size_m, size_k), device='cuda', dtype=torch.float16)
    w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32)
    # NOTE(review): the scale tensor's second dimension is written as size_k;
    # it presumably should follow size_n (both are 4096 here) -- confirm
    # against the kernel's expected scale layout.
    s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16)
    wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                         GPTQ_MARLIN_MAX_PARALLEL).scratch

    gemm_args = (a, w, s, wk, size_m, size_n, size_k)

    x = torch.ops._C.marlin_gemm(*gemm_args)
    y = torch.ops._C.marlin_gemm(*gemm_args)
    torch.testing.assert_close(x, y)

    opcheck(torch.ops._C.marlin_gemm, gemm_args)
\ No newline at end of file
tests/kernels/quantization/untest_triton_scaled_mm.py
0 → 100644
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_scaled_mm kernel
Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
"""
import
os
import
importlib
from
typing
import
Optional
import
pytest
import
torch
from
vllm.platforms
import
current_platform
from
...utils
import
models_path_prefix
# Device string passed to the tensor constructors in this module.
device = "cuda"

# Resolve the kernel under test by dotted module path through importlib and
# bind the function to a module-level name used by the tests below.
triton_scaled_mm_module = importlib.import_module(
    "vllm.model_executor.layers.quantization.compressed_tensors."
    "triton_scaled_mm")
triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
def torch_scaled_mm(a: torch.Tensor,
                    b: torch.Tensor,
                    scale_a: torch.Tensor,
                    scale_b: torch.Tensor,
                    out_dtype: torch.dtype,
                    bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Reference scaled matrix multiply implemented with plain torch ops.

    Both operands are converted to float32 and multiplied with ``torch.mm``.
    The product is scaled by ``scale_a`` and then by ``scale_b`` transposed,
    cast to ``out_dtype``, and finally ``bias`` is added when given.

    Args:
        a: Left operand of the matrix product.
        b: Right operand of the matrix product.
        scale_a: Scale multiplied (with broadcasting) into the product.
        scale_b: Scale whose transpose is multiplied (with broadcasting)
            into the product.
        out_dtype: dtype the scaled product is cast to before the bias is
            added.
        bias: Optional tensor added to the cast result.

    Returns:
        The scaled (and optionally biased) product.
    """
    out = torch.mm(a.to(torch.float32), b.to(torch.float32))
    out = scale_a * out
    # scale_b is transposed so that it broadcasts along the other axis of
    # the product than scale_a does.
    out = scale_b.T * out
    # The cast happens before the bias add, so the addition is carried out
    # on the already-cast tensor.
    out = out.to(out_dtype)
    if bias is not None:
        out = out + bias
    return out
def get_8bit_types():
    """Return the 8-bit input dtypes to parametrize over.

    ``torch.int8`` is always present; the platform's FP8 dtype is appended
    only when ``current_platform.supports_fp8()`` is true.
    """
    fp8_types = ([current_platform.fp8_dtype()]
                 if current_platform.supports_fp8() else [])
    return [torch.int8, *fp8_types]
# This test is to check regressions for int8 support on ROCm.
@pytest.mark.parametrize("model_path", [
    os.path.join(models_path_prefix,
                 "neuralmagic/Llama-3.2-1B-quantized.w8a8"),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.skipif(not current_platform.is_rocm(),
                    reason="Should only run on ROCm")
def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts,
                                      model_path, max_tokens, num_logprobs):
    """Run greedy generation with logprobs on a w8a8 model under ROCm.

    The test makes no assertion on the generated output; it passes when
    loading the model and generating complete without raising.
    """
    with vllm_runner(model_path, dtype="bfloat16") as runner:
        runner.generate_greedy_logprobs(example_prompts, max_tokens,
                                        num_logprobs)
# (M, N, K) problem sizes for the "M,N,K" parametrization of test_scaled_mm,
# where the operands are created as a: (M, K) and b: (K, N).
MNK_FACTORS = [
    (1, 256, 128),
    (33, 256, 496),
    (64, 971, 1024),
    (64, 20486, 128),
    (512, 256, 496),
    (512, 20486, 1024),
]
@pytest.mark.parametrize("M,N,K", MNK_FACTORS)
@pytest.mark.parametrize("out_dtype", [torch.bfloat16])
@pytest.mark.parametrize("in_dtype", get_8bit_types())
@pytest.mark.parametrize("use_scalar_scale_a", [True, False])
@pytest.mark.parametrize("use_scalar_scale_b", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
                   use_scalar_scale_b, use_bias):
    """Compare ``triton_scaled_mm`` against the torch reference.

    Random operands of shape (M, K) and (K, N) are built in ``in_dtype``,
    together with either scalar (1, 1) or per-row/per-column scales and an
    optional bias, and both implementations must agree within
    rtol=atol=1e-1.
    """
    current_platform.seed_everything(0)

    # NOTE: There are cases, where if the matrix is large enough, an output
    # like 65504.4 can be produced, and can easily turn into inf when
    # multiplied when using float16/bfloat16.  This means one function, e.g.,
    # testing function, and another function, e.g. golden function, can
    # produce a non-inf value while the other produces an inf value, and
    # will cause assert_close/allclose to fail, even though if overflow
    # wouldn't have occurred, the values would have been "close."
    #
    # So, the values here are kept small enough to avoid this situation.
    def make_operand(shape):
        if in_dtype.is_floating_point:
            full_precision = torch.rand(shape,
                                        dtype=torch.float32,
                                        device=device)
            return (0.25 * full_precision).to(in_dtype)
        return torch.randint(-32, 32, shape, dtype=in_dtype, device=device)

    # The draws below happen in a fixed order (a, b, scale_a, scale_b, bias)
    # so that the seeded random stream yields the same data on every run.
    a = make_operand((M, K))
    b = make_operand((K, N))

    scale_a = (torch.rand((1, 1), device=device) if use_scalar_scale_a else
               0.25 * torch.rand((M, 1), device=device))
    scale_b = (torch.rand((1, 1), device=device) if use_scalar_scale_b else
               0.25 * torch.rand((N, 1), device=device))

    bias = (torch.rand((N, ), device=device, dtype=out_dtype)
            if use_bias else None)

    kernel_args = (a, b, scale_a, scale_b, out_dtype, bias)
    triton_out = triton_scaled_mm(*kernel_args)
    reference_out = torch_scaled_mm(*kernel_args)

    torch.testing.assert_close(triton_out,
                               reference_out,
                               rtol=1e-1,
                               atol=1e-1)
tests/kernels/test_flex_attention.py
View file @
a99300bd
...
@@ -10,13 +10,19 @@ import pytest
...
@@ -10,13 +10,19 @@ import pytest
import
torch
import
torch
from
packaging
import
version
from
packaging
import
version
from
vllm
import
SamplingParams
from
tests.v1.attention.utils
import
(
BatchSpec
,
create_common_attn_metadata
,
create_standard_kv_cache_spec
,
create_vllm_config
)
from
vllm.v1.attention.backends.flex_attention
import
(
FlexAttentionMetadataBuilder
)
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
..models.utils
import
check_embeddings_close
from
..models.utils
import
check_embeddings_close
,
check_logprobs_close
TORCH_VERSION
=
version
.
parse
(
torch
.
__version__
)
TORCH_VERSION
=
version
.
parse
(
torch
.
__version__
)
MINIMUM_TORCH_VERSION
=
version
.
parse
(
"2.7.0"
)
MINIMUM_TORCH_VERSION
=
version
.
parse
(
"2.7.0"
)
DIRECT_BUILD_VERSION
=
version
.
parse
(
"2.9.dev0"
)
def
set_seed
(
seed
):
def
set_seed
(
seed
):
...
@@ -36,22 +42,18 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -36,22 +42,18 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
"""Test that FlexAttention produces the same outputs as the default backend.
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
This test compares the outputs from the FlexAttention backend with
the default backend, ensuring they are
identical
when using the same seed.
the default backend, ensuring they are
similar
when using the same seed.
"""
"""
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
seed
=
42
seed
=
42
max_tokens
=
24
max_tokens
=
24
num_logprobs
=
5
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
"The president of the United States is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The capital of France is"
,
]
]
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
,
seed
=
seed
,
max_tokens
=
max_tokens
)
# Run with flex attention
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
...
@@ -63,7 +65,8 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -63,7 +65,8 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
)
as
llm_flex
:
enforce_eager
=
True
)
as
llm_flex
:
output_flex
=
llm_flex
.
generate
(
prompts
,
sampling_params
)
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
# Run with default backend
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
@@ -73,20 +76,17 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -73,20 +76,17 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
runner
=
"generate"
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
)
as
llm_default
:
enforce_eager
=
True
,
output_default
=
llm_default
.
generate
(
prompts
,
sampling_params
)
gpu_memory_utilization
=
0.85
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
# Compare outputs from both backends
prompts
,
max_tokens
,
num_logprobs
)
for
i
,
(
flex_result
,
default_result
)
in
enumerate
(
zip
(
output_flex
,
output_default
)):
check_logprobs_close
(
prompt
=
prompts
[
i
]
outputs_0_lst
=
output_flex
,
flex_text
=
flex_result
[
1
][
0
]
outputs_1_lst
=
output_default
,
default_text
=
default_result
[
1
][
0
]
name_0
=
"flex"
,
name_1
=
"default"
,
assert
flex_text
==
default_text
,
(
)
f
"FlexAttention output doesn't match default for:
{
prompt
!
r
}
\n
"
f
"FlexAttention:
{
flex_text
!
r
}
\n
"
f
"Default:
{
default_text
!
r
}
"
)
@
pytest
.
mark
.
skipif
(
@
pytest
.
mark
.
skipif
(
...
@@ -138,5 +138,70 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -138,5 +138,70 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
)
)
@pytest.mark.skipif(
    not torch.cuda.is_available() or TORCH_VERSION < DIRECT_BUILD_VERSION,
    # Report the version the condition actually checks against.
    reason=("CUDA not available or PyTorch version < "
            f"{DIRECT_BUILD_VERSION}"),
)
def test_block_mask_direct_vs_slow_path():
    """Test that direct path block mask is a superset of slow path.

    The direct path may include extra blocks for performance (over-estimation),
    but must include all blocks that the slow path determines are necessary.
    """
    device = torch.device("cuda")

    vllm_config = create_vllm_config(model_name="meta-llama/Meta-Llama-3-8B",
                                     block_size=16,
                                     max_model_len=1024)
    kv_cache_spec = create_standard_kv_cache_spec(vllm_config)

    # Use a mixed batch that will create groups spanning multiple sequences
    batch_spec = BatchSpec(seq_lens=[35, 64, 128, 256],
                           query_lens=[33, 5, 32, 64],
                           name="test_mixed_batch")

    common_attn_metadata = create_common_attn_metadata(
        batch_spec, vllm_config.cache_config.block_size, device)

    builder = FlexAttentionMetadataBuilder(kv_cache_spec, [], vllm_config,
                                           device)

    # Build once with the builder as constructed, then again on the same
    # builder after switching direct_build off.
    metadata_direct = builder.build(common_prefix_len=0,
                                    common_attn_metadata=common_attn_metadata)
    builder.direct_build = False
    metadata_slow = builder.build(common_prefix_len=0,
                                  common_attn_metadata=common_attn_metadata)

    assert metadata_direct.block_mask is not None
    assert metadata_slow.block_mask is not None

    # Extract block indices for comparison, B, H are the same
    direct_indices = metadata_direct.block_mask.kv_indices[0, 0]
    slow_indices = metadata_slow.block_mask.kv_indices[0, 0]
    direct_num = metadata_direct.block_mask.kv_num_blocks[0, 0]
    slow_num = metadata_slow.block_mask.kv_num_blocks[0, 0]

    # main test: every block needed by slow path must be in direct path
    num_groups = direct_num.shape[0]
    missing_details = []

    for group_idx in range(num_groups):
        # Only the first kv_num_blocks entries of each row are compared.
        direct_blocks = set(
            direct_indices[group_idx, :direct_num[group_idx]].tolist())
        slow_blocks = set(
            slow_indices[group_idx, :slow_num[group_idx]].tolist())

        missing_blocks = slow_blocks - direct_blocks
        if missing_blocks:
            missing_details.append(
                f"Group {group_idx}: missing {sorted(missing_blocks)}")

    assert not missing_details, (
        "Direct path is missing blocks required by slow path:\n" +
        "\n".join(missing_details))
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
pytest
.
main
([
__file__
])
tests/kernels/test_onednn.py
0 → 100644
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""
from
typing
import
Optional
import
pytest
import
torch
from
tests.kernels.utils
import
to_int8
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
# Skip the whole module unless the current platform is CPU.
if not current_platform.is_cpu():
    pytest.skip("skipping CPU-only tests", allow_module_level=True)

# (n, k) pairs for the "n,k" parametrization.
NK_FACTORS = [
    (256, 128),
    (4096, 4096),
    (16384, 4096),
    (1023, 491),
    (1001, 15),
]
# Each entry is a tuple of m values; one test invocation loops over every m
# in its entry.
M_FACTORS = [
    (16, 1, 32, 128, 64),
    (1, 17, 1, 31, 17),
]
# Values for the "primitive_cache_size" parametrization.
CACHE_SIZES = [2]
# Output dtypes for the "output_type" parametrization.
DTYPE = [torch.bfloat16]
def rand_int8(shape: tuple, device: str = "cpu"):
    """Return ``to_int8`` applied to uniform samples scaled to [-128, 127)."""
    uniform = torch.rand(shape, device=device)
    return to_int8(uniform * 255 - 128)
def ref_int8_scaled_mm(
    a: torch.Tensor,
    b: torch.Tensor,
    scale_a: torch.Tensor,
    scale_b: torch.Tensor,
    azp: Optional[torch.Tensor],
    bias: Optional[torch.Tensor],
    output_type: torch.dtype,
):
    """Float32 reference for the scaled int8 matrix multiply.

    Computes ``(scale_a * (a - azp)) @ (scale_b * b) + bias`` in float32,
    where the ``azp`` subtraction and the ``bias`` addition are skipped when
    the respective argument is ``None``, and casts the result to
    ``output_type``.
    """
    lhs = a.to(dtype=torch.float32)
    if azp is not None:
        # Remove the activation zero point before scaling.
        lhs = lhs - azp.to(dtype=torch.float32)
    rhs = b.to(dtype=torch.float32)

    result = torch.mm(scale_a * lhs, scale_b * rhs)
    if bias is not None:
        result += bias.float()

    return result.to(dtype=output_type)
def onednn_int8_gemm_test_helper(primitive_cache_size: int,
                                 m: int,
                                 n: int,
                                 k: int,
                                 per_tensor_a_quant: bool,
                                 per_tensor_b_quant: bool,
                                 use_azp: bool,
                                 use_bias: bool,
                                 out_dtype: torch.dtype = torch.bfloat16,
                                 device: str = "cpu"):
    """Run the oneDNN int8 scaled matmul once and compare to the reference.

    Test for a oneDNN kernel with per-tensor / per-token activation
    quantization and per-tensor / per-output channel weight quantization.

    Args:
        primitive_cache_size: Forwarded to ``ops.create_onednn_scaled_mm``.
        m, n, k: Problem size; the activation is (m, k) and the weight is
            (k, n).
        per_tensor_a_quant: Use a (1, 1) activation scale instead of (m, 1).
        per_tensor_b_quant: Use a (1, 1) weight scale instead of (1, n).
        use_azp: Generate an activation zero point and its adjustment term.
        use_bias: Generate a bias; when set, the kernel is additionally run
            with the bias replaced by ``None``.
        out_dtype: dtype of the output buffer and of the bias.
        device: Device on which every tensor of this helper is created.
    """
    a = to_int8(torch.randn((m, k), device=device) * 5)
    # The weight is generated as (n, k) and transposed to (k, n).
    b = to_int8(torch.randn((n, k), device=device).t() * 5)

    a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1)
    b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n)

    scale_a = torch.randn(a_scales_shape, device=device, dtype=torch.float32)
    scale_b = torch.randn(b_scales_shape, device=device, dtype=torch.float32)

    if use_azp:
        # Created on `device` like every other tensor so that the division
        # by scale_a below never mixes devices.
        azp = torch.rand(a_scales_shape, dtype=torch.float32,
                         device=device) * 10 + 1.5
        azp = (azp / scale_a).round().to(dtype=torch.int32)
        # Scaled sum of the weight over its first (k) dimension; it is
        # passed to the kernel next to azp.
        azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32)
    else:
        azp = None
        azp_adj = None

    if use_bias:
        bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
    else:
        bias = None

    handler = ops.create_onednn_scaled_mm(
        b,
        scale_b,
        out_dtype,
        # NOTE(review): presumably selects per-token (dynamic) activation
        # scaling -- confirm against create_onednn_scaled_mm.
        not per_tensor_a_quant,
        use_azp,
        primitive_cache_size,
    )

    # Always check the configured bias; when a bias is in use, also check
    # that the same handler works with the bias dropped at call time
    # (runtime bias setting).
    runtime_biases = [bias, None] if use_bias else [bias]
    for runtime_bias in runtime_biases:
        out = torch.zeros((m, n), dtype=out_dtype, device=device)
        ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj,
                             runtime_bias)
        baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp,
                                      runtime_bias, out_dtype)
        torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
@pytest.mark.parametrize("n,k", NK_FACTORS)
@pytest.mark.parametrize("m_list", M_FACTORS)
@pytest.mark.parametrize("per_tensor_a_scale", [True, False])
@pytest.mark.parametrize("per_tensor_b_scale", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("use_azp", [True, False])
@pytest.mark.parametrize("output_type", DTYPE)
@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
def test_onednn_int8_scaled_gemm(
    n: int,
    k: int,
    m_list: tuple[int, ...],
    per_tensor_a_scale: bool,
    per_tensor_b_scale: bool,
    use_bias: bool,
    use_azp: bool,
    output_type: torch.dtype,
    primitive_cache_size: int,
):
    """Run the oneDNN int8 GEMM helper for every ``m`` in ``m_list``.

    All other parameters are forwarded unchanged to
    ``onednn_int8_gemm_test_helper``; several m values are exercised within
    a single test invocation for one (n, k) and quantization configuration.
    """
    for m in m_list:
        onednn_int8_gemm_test_helper(
            primitive_cache_size=primitive_cache_size,
            m=m,
            n=n,
            k=k,
            per_tensor_a_quant=per_tensor_a_scale,
            per_tensor_b_quant=per_tensor_b_scale,
            use_bias=use_bias,
            use_azp=use_azp,
            out_dtype=output_type,
        )
tests/lora/conftest.py
View file @
a99300bd
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
import
tempfile
import
tempfile
from
collections
import
OrderedDict
from
collections
import
OrderedDict
from
unittest.mock
import
MagicMock
,
patch
from
unittest.mock
import
MagicMock
import
pytest
import
pytest
import
os
import
os
...
@@ -11,8 +11,6 @@ import torch
...
@@ -11,8 +11,6 @@ import torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
import
vllm
from
vllm.config
import
LoRAConfig
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
init_distributed_environment
,
init_distributed_environment
,
initialize_model_parallel
)
initialize_model_parallel
)
...
@@ -22,7 +20,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...
@@ -22,7 +20,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.models.interfaces
import
SupportsLoRA
from
vllm.model_executor.models.interfaces
import
SupportsLoRA
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
...
@@ -106,6 +103,7 @@ def dummy_model() -> nn.Module:
...
@@ -106,6 +103,7 @@ def dummy_model() -> nn.Module:
]))
]))
model
.
config
=
MagicMock
()
model
.
config
=
MagicMock
()
model
.
embedding_modules
=
{
"lm_head"
:
"lm_head"
}
model
.
embedding_modules
=
{
"lm_head"
:
"lm_head"
}
model
.
unpadded_vocab_size
=
32000
return
model
return
model
...
@@ -139,6 +137,8 @@ def dummy_model_gate_up() -> nn.Module:
...
@@ -139,6 +137,8 @@ def dummy_model_gate_up() -> nn.Module:
],
],
}
}
model
.
embedding_modules
=
{
"lm_head"
:
"lm_head"
}
model
.
embedding_modules
=
{
"lm_head"
:
"lm_head"
}
model
.
unpadded_vocab_size
=
32000
return
model
return
model
...
@@ -223,40 +223,6 @@ def tinyllama_lora_files():
...
@@ -223,40 +223,6 @@ def tinyllama_lora_files():
return
os
.
path
.
join
(
models_path_prefix
,
"jashing/tinyllama-colorist-lora"
)
return
os
.
path
.
join
(
models_path_prefix
,
"jashing/tinyllama-colorist-lora"
)
@pytest.fixture(scope="session")
def phi2_lora_files():
    """Return the path of the phi-2 test SQL LoRA under models_path_prefix."""
    # The path is joined locally instead of calling
    # snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora").
    lora_path = os.path.join(models_path_prefix,
                             "isotr0py/phi-2-test-sql-lora")
    return lora_path
@pytest.fixture(scope="session")
def qwen_lora_files():
    """Return the path of the qwen nl2dsl LoRA under models_path_prefix."""
    # NOTE(review): the snapshot_download call that used to sit here named
    # repo_id="jeeejeee/chatglm3-text2sql-spider", which does not match the
    # path below -- confirm which adapter is intended.
    lora_path = os.path.join(models_path_prefix, "customize/qwen-nl2dsl-lora")
    return lora_path
@pytest.fixture
def llama_2_7b_engine_extra_embeddings():
    """Yield the ``llm_engine`` of a Llama-2-7b ``vllm.LLM``.

    While the LLM is constructed, ``vllm.worker.model_runner.get_model`` is
    patched with a wrapper that sets a ``LoRAConfig`` on the ``vllm_config``
    keyword argument before delegating to the real ``get_model``.
    Distributed state and memory are cleaned up before creation and after
    the fixture is released.
    """
    cleanup_dist_env_and_memory(shutdown_ray=True)

    original_get_model = get_model

    def get_model_with_lora_config(**kwargs):
        # Set the LoRA config on the vllm_config before the model is built.
        kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4,
                                                       max_lora_rank=8)
        return original_get_model(**kwargs)

    model_path = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
    with patch("vllm.worker.model_runner.get_model",
               get_model_with_lora_config):
        engine = vllm.LLM(model_path, enable_lora=False)

    yield engine.llm_engine

    del engine
    cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture
def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
    """Yield the model held by the engine's driver-worker model runner."""
    driver_worker = (
        llama_2_7b_engine_extra_embeddings.model_executor.driver_worker)
    yield driver_worker.model_runner.model
@
pytest
.
fixture
@
pytest
.
fixture
def
reset_default_device
():
def
reset_default_device
():
"""
"""
...
...
tests/lora/test_add_lora.py
View file @
a99300bd
...
@@ -6,7 +6,6 @@ import time
...
@@ -6,7 +6,6 @@ import time
import
os
import
os
import
pytest
import
pytest
import
vllm.envs
as
env
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.entrypoints.openai.api_server
import
(
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
)
build_async_engine_client_from_engine_args
)
...
@@ -100,12 +99,10 @@ async def test_add_lora(chatglm3_lora_files):
...
@@ -100,12 +99,10 @@ async def test_add_lora(chatglm3_lora_files):
# Run with warmup
# Run with warmup
add_lora_tasks
=
[
llm
.
add_lora
(
lr
)
for
lr
in
warmup_run_requests
]
add_lora_tasks
=
[
llm
.
add_lora
(
lr
)
for
lr
in
warmup_run_requests
]
add_lora_results
=
await
asyncio
.
gather
(
*
add_lora_tasks
)
add_lora_results
=
await
asyncio
.
gather
(
*
add_lora_tasks
)
if
env
.
VLLM_USE_V1
:
# Test that all all_lora calls are successful.
# Test that all all_lora calls are successful.
assert
all
(
add_lora_results
)
assert
all
(
add_lora_results
)
else
:
# No way to check V0 engine results as the calls just return None.
pass
time_with_add_lora
=
await
requests_processing_time
(
time_with_add_lora
=
await
requests_processing_time
(
llm
,
warmup_run_requests
)
llm
,
warmup_run_requests
)
...
...
tests/lora/test_chatglm3_tp.py
View file @
a99300bd
...
@@ -89,6 +89,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
...
@@ -89,6 +89,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
@
multi_gpu_test
(
num_gpus
=
4
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
def
test_chatglm3_lora_tp4_fully_sharded_loras
(
chatglm3_lora_files
):
def
test_chatglm3_lora_tp4_fully_sharded_loras
(
chatglm3_lora_files
):
# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
# more GPU memory causing vLLM to OOM
llm
=
vllm
.
LLM
(
MODEL_PATH
,
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
max_model_len
=
1024
,
enable_lora
=
True
,
enable_lora
=
True
,
...
@@ -97,7 +100,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
...
@@ -97,7 +100,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
tensor_parallel_size
=
4
,
tensor_parallel_size
=
4
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
fully_sharded_loras
=
True
,
fully_sharded_loras
=
True
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
,
gpu_memory_utilization
=
0.85
)
output1
=
do_sample
(
llm
,
chatglm3_lora_files
,
lora_id
=
1
)
output1
=
do_sample
(
llm
,
chatglm3_lora_files
,
lora_id
=
1
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output1
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
assert
output1
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
...
...
tests/lora/test_layers.py
View file @
a99300bd
...
@@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool:
...
@@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
...
@@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
...
@@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
# @pytest.mark.skip(
# @pytest.mark.skip(
# reason="Fails when loras are in any slot other than the first.")
# reason="Fails when loras are in any slot other than the first.")
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
...
@@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
...
@@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
256512
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
256512
])
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
...
@@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
...
@@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"bias_enabled"
,
[
True
,
False
])
def
test_linear_replicated
(
def
test_linear_replicated
(
dist_init
,
num_loras
,
device
,
stage
,
dist_init
,
bias_enabled
)
->
None
:
num_loras
,
device
,
stage
,
)
->
None
:
if
current_platform
.
is_cuda_alike
():
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
...
@@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
...
@@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
max_loras
,
lora_dtype
=
torch
.
float16
,
max_lora_rank
=
8
,
bias_enabled
=
bias_enabled
)
lora_dtype
=
torch
.
float16
,
)
def
create_random_linear_replicated_layer
():
def
create_random_linear_replicated_layer
():
...
@@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
...
@@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
lora_linear
.
create_lora_weights
(
max_loras
,
lora_config
)
lora_linear
.
create_lora_weights
(
max_loras
,
lora_config
)
assert
(
lora_linear
.
n_slices
==
len
(
lora_linear
.
lora_a_stacked
)
==
len
(
assert
(
lora_linear
.
n_slices
==
len
(
lora_linear
.
lora_a_stacked
)
==
len
(
lora_linear
.
lora_b_stacked
)
==
1
)
lora_linear
.
lora_b_stacked
)
==
1
)
if
bias_enabled
:
assert
len
(
lora_linear
.
lora_bias_stacked
)
==
lora_linear
.
n_slices
else
:
assert
lora_linear
.
lora_bias_stacked
is
None
return
linear
,
lora_linear
return
linear
,
lora_linear
for
i
in
range
(
NUM_RANDOM_SEEDS
):
for
i
in
range
(
NUM_RANDOM_SEEDS
):
...
@@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
...
@@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"orientation"
,
[
"row"
,
"column"
])
@
pytest
.
mark
.
parametrize
(
"orientation"
,
[
"row"
,
"column"
])
@
pytest
.
mark
.
parametrize
(
"fully_shard"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"fully_shard"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"bias_enabled"
,
[
True
,
False
])
def
test_linear_parallel
(
dist_init
,
num_loras
,
orientation
,
fully_shard
,
def
test_linear_parallel
(
dist_init
,
num_loras
,
orientation
,
fully_shard
,
device
,
stage
,
bias_enabled
)
->
None
:
device
,
stage
)
->
None
:
if
current_platform
.
is_cuda_alike
():
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
...
@@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
...
@@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
max_loras
,
fully_sharded_loras
=
fully_shard
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
,
fully_sharded_loras
=
fully_shard
,
bias_enabled
=
bias_enabled
)
lora_dtype
=
torch
.
float16
,
)
def
create_random_linear_parallel_layer
():
def
create_random_linear_parallel_layer
():
if
orientation
==
"row"
:
if
orientation
==
"row"
:
...
@@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
...
@@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
lora_linear
.
create_lora_weights
(
max_loras
,
lora_config
)
lora_linear
.
create_lora_weights
(
max_loras
,
lora_config
)
assert
(
lora_linear
.
n_slices
==
len
(
lora_linear
.
lora_a_stacked
)
==
len
(
assert
(
lora_linear
.
n_slices
==
len
(
lora_linear
.
lora_a_stacked
)
==
len
(
lora_linear
.
lora_b_stacked
)
==
1
)
lora_linear
.
lora_b_stacked
)
==
1
)
if
bias_enabled
:
assert
len
(
lora_linear
.
lora_bias_stacked
)
==
lora_linear
.
n_slices
else
:
assert
lora_linear
.
lora_bias_stacked
is
None
return
linear
,
lora_linear
return
linear
,
lora_linear
for
i
in
range
(
NUM_RANDOM_SEEDS
):
for
i
in
range
(
NUM_RANDOM_SEEDS
):
...
@@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
...
@@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"repeats"
,
[
1
,
2
,
3
])
@
pytest
.
mark
.
parametrize
(
"repeats"
,
[
1
,
2
,
3
])
@
pytest
.
mark
.
parametrize
(
"fully_shard"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"fully_shard"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"bias_enabled"
,
[
True
,
False
])
def
test_column_parallel_packed
(
dist_init
,
num_loras
,
repeats
,
fully_shard
,
def
test_column_parallel_packed
(
dist_init
,
num_loras
,
repeats
,
fully_shard
,
device
,
stage
,
bias_enabled
)
->
None
:
device
,
stage
)
->
None
:
if
current_platform
.
is_cuda_alike
():
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
...
@@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
...
@@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
max_loras
,
fully_sharded_loras
=
fully_shard
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
,
fully_sharded_loras
=
fully_shard
,
bias_enabled
=
bias_enabled
)
lora_dtype
=
torch
.
float16
,
)
def
create_column_parallel_packed_layer
():
def
create_column_parallel_packed_layer
():
if
repeats
==
2
:
if
repeats
==
2
:
...
@@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
...
@@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
model_config
=
FakeConfig
())
model_config
=
FakeConfig
())
assert
(
lora_linear
.
n_slices
==
len
(
lora_linear
.
lora_a_stacked
)
==
len
(
assert
(
lora_linear
.
n_slices
==
len
(
lora_linear
.
lora_a_stacked
)
==
len
(
lora_linear
.
lora_b_stacked
)
==
n_slices
)
lora_linear
.
lora_b_stacked
)
==
n_slices
)
if
bias_enabled
:
assert
len
(
lora_linear
.
lora_bias_stacked
)
==
lora_linear
.
n_slices
else
:
assert
lora_linear
.
lora_bias_stacked
is
None
return
linear
,
lora_linear
return
linear
,
lora_linear
for
i
in
range
(
NUM_RANDOM_SEEDS
):
for
i
in
range
(
NUM_RANDOM_SEEDS
):
...
...
tests/lora/test_llama_tp.py
View file @
a99300bd
...
@@ -114,8 +114,7 @@ def test_llama_lora(sql_lora_files):
...
@@ -114,8 +114,7 @@ def test_llama_lora(sql_lora_files):
enable_lora
=
True
,
enable_lora
=
True
,
# also test odd max_num_seqs
# also test odd max_num_seqs
max_num_seqs
=
13
,
max_num_seqs
=
13
,
max_loras
=
4
,
max_loras
=
4
)
enable_chunked_prefill
=
True
)
generate_and_test
(
llm
,
sql_lora_files
)
generate_and_test
(
llm
,
sql_lora_files
)
...
@@ -129,7 +128,6 @@ def test_llama_lora_tp4(sql_lora_files):
...
@@ -129,7 +128,6 @@ def test_llama_lora_tp4(sql_lora_files):
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_loras
=
4
,
tensor_parallel_size
=
4
,
tensor_parallel_size
=
4
,
enable_chunked_prefill
=
True
,
)
)
generate_and_test
(
llm
,
sql_lora_files
)
generate_and_test
(
llm
,
sql_lora_files
)
...
@@ -145,7 +143,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
...
@@ -145,7 +143,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
max_loras
=
4
,
max_loras
=
4
,
tensor_parallel_size
=
4
,
tensor_parallel_size
=
4
,
fully_sharded_loras
=
True
,
fully_sharded_loras
=
True
,
enable_chunked_prefill
=
True
,
)
)
generate_and_test
(
llm
,
sql_lora_files
)
generate_and_test
(
llm
,
sql_lora_files
)
...
...
tests/lora/test_multi_loras
_with_tp
.py
→
tests/lora/test_
llm_with_
multi_loras.py
View file @
a99300bd
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
Script to test multi loras service with tp >= 2
This script contains:
1. test multi loras service with tp >= 2
2. test multi loras request
"""
"""
import
pytest
from
tests.utils
import
multi_gpu_test
from
tests.utils
import
multi_gpu_test
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
...
@@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():
...
@@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():
output_text
=
call_llm_get_outputs
(
prompt
,
"Alice"
)
output_text
=
call_llm_get_outputs
(
prompt
,
"Alice"
)
check_outputs
(
output_text
,
expected_output
)
check_outputs
(
output_text
,
expected_output
)
def test_multiple_lora_requests():
    """Exercise ``LLM.generate`` with list-valued and single LoRA requests.

    Three calls are made against the same engine:

    1. a list holding one LoRARequest per prompt -> one output per prompt;
    2. a list shorter than the prompt list -> ``ValueError`` is expected;
    3. a single LoRARequest for all prompts -> one output per prompt.
    """
    engine = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )

    prompts = ["Hello, my name is"] * 2
    adapter_name = "Alice"
    adapter_path = LORA_NAME_PATH_MAP[adapter_name]

    # Build one request per prompt: names are suffixed with the 0-based
    # prompt position, integer ids are that position plus one, and every
    # request points at the same adapter path.
    per_prompt_requests = [
        LoRARequest(f"{adapter_name}{position}", position + 1, adapter_path)
        for position, _ in enumerate(prompts)
    ]

    # A list of LoRARequests with the same length as the prompt list.
    results = engine.generate(prompts, lora_request=per_prompt_requests)
    assert len(results) == len(prompts)

    # A list whose length differs from the number of prompts must be rejected.
    with pytest.raises(ValueError):
        engine.generate(prompts, lora_request=per_prompt_requests[:1])

    # A single (non-list) LoRARequest passed for all prompts.
    shared_request = per_prompt_requests[0]
    results = engine.generate(prompts, lora_request=shared_request)
    assert len(results) == len(prompts)
Prev
1
…
9
10
11
12
13
14
15
16
17
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment