Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
377 additions
and
151 deletions
+377
-151
tests/kernels/quantization/test_int8_quant.py
tests/kernels/quantization/test_int8_quant.py
+5
-5
tests/kernels/quantization/test_mxfp4_qutlass.py
tests/kernels/quantization/test_mxfp4_qutlass.py
+2
-1
tests/kernels/quantization/test_nvfp4_qutlass.py
tests/kernels/quantization/test_nvfp4_qutlass.py
+2
-1
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+3
-1
tests/kernels/quantization/test_triton_scaled_mm.py
tests/kernels/quantization/test_triton_scaled_mm.py
+3
-1
tests/kernels/quantization/untest_block_fp8.py
tests/kernels/quantization/untest_block_fp8.py
+51
-0
tests/kernels/quantization/untest_fp8_quant.py
tests/kernels/quantization/untest_fp8_quant.py
+107
-6
tests/kernels/quantization/untest_nvfp4_quant.py
tests/kernels/quantization/untest_nvfp4_quant.py
+3
-2
tests/kernels/quantization/untest_nvfp4_scaled_mm.py
tests/kernels/quantization/untest_nvfp4_scaled_mm.py
+2
-1
tests/kernels/test_apply_repetition_penalties.py
tests/kernels/test_apply_repetition_penalties.py
+3
-2
tests/kernels/test_fla_layernorm_guard.py
tests/kernels/test_fla_layernorm_guard.py
+8
-8
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+44
-51
tests/kernels/untest_fused_quant_activation.py
tests/kernels/untest_fused_quant_activation.py
+1
-0
tests/kernels/utils.py
tests/kernels/utils.py
+1
-1
tests/lora/conftest.py
tests/lora/conftest.py
+27
-2
tests/lora/test_fused_moe_lora_kernel.py
tests/lora/test_fused_moe_lora_kernel.py
+4
-4
tests/lora/test_gptoss_tp.py
tests/lora/test_gptoss_tp.py
+50
-34
tests/lora/test_layers.py
tests/lora/test_layers.py
+19
-16
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+8
-1
tests/lora/test_lora_manager.py
tests/lora/test_lora_manager.py
+34
-14
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/kernels/quantization/test_int8_quant.py
View file @
7e63ef82
...
...
@@ -7,7 +7,7 @@ import torch
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
tests.kernels.utils
import
opcheck
from
vllm._custom_ops
import
scaled_int8_quant
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
17
,
1024
,
1025
,
1026
,
5137
,
8193
]
...
...
@@ -48,7 +48,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
def
test_dynamic_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
...
@@ -74,7 +74,7 @@ def test_dynamic_scaled_int8_quant(
def
test_dynamic_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
-
300
...
...
@@ -115,7 +115,7 @@ def test_dynamic_scaled_int8_azp_quant(
def
test_static_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
...
@@ -148,7 +148,7 @@ def test_static_scaled_int8_azp_quant(
scale
:
float
,
azp
:
int
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
-
300
...
...
tests/kernels/quantization/test_mxfp4_qutlass.py
View file @
7e63ef82
...
...
@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
from
vllm._custom_ops
import
fusedQuantizeMx
,
matmul_mxf4_bf16_tn
from
vllm.model_executor.layers.quantization.qutlass_utils
import
to_blocked
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
"CUDA required for these tests."
,
allow_module_level
=
True
)
...
...
@@ -205,7 +206,7 @@ LLAMA_MODELS = {
@
pytest
.
fixture
(
autouse
=
True
)
def
_seed_each_test
():
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
np
.
random
.
seed
(
0
)
torch
.
random
.
manual_seed
(
0
)
...
...
tests/kernels/quantization/test_nvfp4_qutlass.py
View file @
7e63ef82
...
...
@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from
vllm._custom_ops
import
fusedQuantizeNv
from
vllm.model_executor.layers.quantization.qutlass_utils
import
to_blocked
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
"CUDA required for these tests."
,
allow_module_level
=
True
)
...
...
@@ -193,7 +194,7 @@ LLAMA_MODELS = {
@
pytest
.
fixture
(
autouse
=
True
)
def
_seed_each_test
():
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
np
.
random
.
seed
(
0
)
torch
.
random
.
manual_seed
(
0
)
...
...
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
View file @
7e63ef82
...
...
@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from
vllm._custom_ops
import
scaled_fp4_quant
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -30,10 +31,11 @@ BLOCK_SIZE = 16
@
pytest
.
mark
.
parametrize
(
"shape"
,
SHAPES
)
@
torch
.
inference_mode
()
def
test_silu_mul_nvfp4_quant
(
default_vllm_config
,
dtype
:
torch
.
dtype
,
shape
:
tuple
[
int
,
int
],
)
->
None
:
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
"cuda:0"
torch
.
set_default_device
(
device
)
...
...
tests/kernels/quantization/test_triton_scaled_mm.py
View file @
7e63ef82
...
...
@@ -11,7 +11,9 @@ import pytest
import
torch
from
vllm.platforms
import
current_platform
from
...utils
import
models_path_prefix
from
vllm.utils.torch_utils
import
set_random_seed
device
=
"cuda"
...
...
@@ -86,7 +88,7 @@ def test_scaled_mm(
):
is_floating_point_type
=
lambda
t
:
torch
.
tensor
([
1
,
1
],
dtype
=
t
).
is_floating_point
()
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
# NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when
...
...
tests/kernels/quantization/untest_block_fp8.py
View file @
7e63ef82
...
...
@@ -24,6 +24,10 @@ from vllm.utils.deep_gemm import (
per_block_cast_to_fp8
,
should_use_deepgemm_for_fp8_linear
,
)
from
vllm.utils.flashinfer
import
(
flashinfer_fp8_blockscale_gemm
,
has_flashinfer_fp8_blockscale_gemm
,
)
from
vllm.utils.import_utils
import
has_deep_gemm
if
current_platform
.
get_device_capability
()
<
(
9
,
0
):
...
...
@@ -205,3 +209,50 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
torch
.
abs
(
out
.
to
(
torch
.
float32
)
-
ref_out
.
to
(
torch
.
float32
))
)
/
torch
.
mean
(
torch
.
abs
(
ref_out
.
to
(
torch
.
float32
)))
assert
rel_diff
<
0.001
@pytest.mark.skipif(
    current_platform.is_fp8_fnuz(),
    reason="This platform supports e4m3fnuz, not e4m3fn.",
)
@pytest.mark.parametrize(
    "M,N,K,block_size,out_dtype,seed",
    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
)
@torch.inference_mode()
def test_w8a8_block_fp8_flashinfer_matmul(M, N, K, block_size, out_dtype, seed):
    """Check the FlashInfer FP8 block-scale GEMM against the native reference.

    The reference result comes from ``native_w8a8_block_matmul`` on the
    quantized activations and weights. The FlashInfer call is handed the
    bf16 activations with ``input_scale=None`` together with the quantized
    weights and their float32 scales. The mean absolute difference relative
    to the mean absolute reference value must stay below 0.001.
    """
    if not has_flashinfer_fp8_blockscale_gemm():
        pytest.skip(
            "FlashInfer block GEMM not available (requires SM90+ and FlashInfer)"
        )

    # only aligned sizes
    k_is_aligned = K % 128 == 0
    n_is_aligned = N % 64 == 0
    if not (k_is_aligned and n_is_aligned):
        pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")

    torch.manual_seed(seed)
    fp8_max = torch.finfo(torch.float8_e4m3fn).max

    # A is drawn before B so the random stream is consumed in a fixed order.
    A_bf16 = (torch.rand(M, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
    B_bf16 = (torch.rand(N, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max

    act_fp8, act_scales_raw = per_token_group_quant_fp8(
        A_bf16, block_size[1], use_ue8m0=False
    )
    weight_fp8, weight_scales_raw = per_block_cast_to_fp8(
        B_bf16, block_size, use_ue8m0=False
    )

    act_scales = act_scales_raw.to(torch.float32)
    weight_scales = weight_scales_raw.to(torch.float32)

    expected = native_w8a8_block_matmul(
        act_fp8, weight_fp8, act_scales, weight_scales, block_size, out_dtype
    )

    actual = flashinfer_fp8_blockscale_gemm(
        input=A_bf16,
        weight=weight_fp8,
        input_scale=None,
        weight_scale=weight_scales,
        out_dtype=out_dtype,
    )

    # Both sides are compared in bfloat16, as in the original check.
    expected_bf16 = expected.to(torch.bfloat16)
    abs_error = torch.abs(actual.to(torch.bfloat16) - expected_bf16)
    rel_diff = torch.mean(abs_error) / torch.mean(torch.abs(expected_bf16))
    assert rel_diff < 0.001
tests/kernels/quantization/untest_fp8_quant.py
View file @
7e63ef82
...
...
@@ -11,7 +11,11 @@ from tests.kernels.quant_utils import (
ref_dynamic_per_token_quant
,
)
from
tests.kernels.utils
import
opcheck
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
scaled_quantize
,
)
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
17
,
1024
,
1025
,
1026
,
5137
,
8193
]
...
...
@@ -21,10 +25,18 @@ SEEDS = [0]
def
opcheck_fp8_quant
(
output
,
input
,
scale
=
None
,
scale_ub
=
None
,
use_per_token_if_dynamic
=
False
output
,
input
,
scale
=
None
,
scale_ub
=
None
,
use_per_token_if_dynamic
=
False
,
group_shape
=
None
,
):
if
scale
is
not
None
:
opcheck
(
torch
.
ops
.
_C
.
static_scaled_fp8_quant
,
(
output
,
input
,
scale
))
opcheck
(
torch
.
ops
.
_C
.
static_scaled_fp8_quant
,
(
output
,
input
,
scale
,
group_shape
),
)
elif
use_per_token_if_dynamic
:
scale
=
torch
.
empty
(
(
input
.
shape
[
0
],
1
),
device
=
input
.
device
,
dtype
=
torch
.
float32
...
...
@@ -51,7 +63,7 @@ def opcheck_fp8_quant(
def
test_dynamic_per_token_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
scale_ub
:
bool
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
(
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
+
1e-6
...
...
@@ -81,7 +93,7 @@ def test_dynamic_per_token_fp8_quant(
def
test_dynamic_per_tensor_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
...
...
@@ -101,7 +113,7 @@ def test_dynamic_per_tensor_fp8_quant(
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
def
test_fp8_quant_large
(
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
num_tokens
=
1024000
# Mistral-Nemo's max_position_embeddings
hidden_size
=
1152
# Smallest hidden_size to reproduce the error
...
...
@@ -117,4 +129,93 @@ def test_fp8_quant_large(seed: int) -> None:
ref_out
=
ref_out
.
to
(
dtype
=
dtype
)
ops_out
=
ops_out
.
to
(
dtype
=
dtype
)
torch
.
testing
.
assert_close
(
ref_out
,
ops_out
)
\ No newline at end of file
torch
.
testing
.
assert_close
(
ref_out
,
ops_out
)
# Test static FP8 quantization with 2D group scales
# Each entry is a (group_m, group_n) pair; the tests below treat -1 as the
# full extent of that dimension.
GROUP_SHAPES_2D = [
    (-1, -1),  # Per-tensor
    (-1, 1),  # Per-channel
    (1, -1),  # Per-token
    (-1, 128),  # Per-head quantization
    (1, 128),  # DeepSeek-style per-token-per-group (group_m=1, group_n=128)
    (128, 128),  # DeepSeek-style block quantization
    (1, 64),  # Smaller group size
    (1, 16),  # Small group (scalar path in kernel)
    (4, 256),  # Non-trivial both dimensions
]
# Use sizes divisible by all group shapes
NUM_TOKENS_GROUP = [128, 512]
HIDDEN_SIZES_GROUP = [256, 1024, 2048]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
@pytest.mark.parametrize("group_shape", GROUP_SHAPES_2D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_static_fp8_quant_group_2d(
    num_tokens: int,
    hidden_size: int,
    group_shape: tuple[int, int],
    dtype: torch.dtype,
    seed: int,
) -> None:
    """Test static FP8 quantization with 2D group scales using scaled_quantize.

    ``scaled_quantize`` produces the reference output and the scales; those
    scales are then fed to ``ops.scaled_fp8_quant`` as static scales. The
    returned scale must match the one passed in, and the quantized output
    must match the reference within ``rtol=0.12``.
    """
    # Normalize group_shape (-1 means full extent)
    norm_group_m = num_tokens if group_shape[0] == -1 else group_shape[0]
    norm_group_n = hidden_size if group_shape[1] == -1 else group_shape[1]

    # Skip if sizes are not divisible by group shape
    if num_tokens % norm_group_m != 0 or hidden_size % norm_group_n != 0:
        pytest.skip(
            f"Skipping: ({num_tokens}, {hidden_size}) not divisible by "
            f"group_shape ({group_shape[0]}, {group_shape[1]})"
        )

    # Seed through set_random_seed, the helper every other test in this
    # file uses, instead of current_platform.seed_everything.
    set_random_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
    ref_out, scale = scaled_quantize(
        x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
    )
    ops_out, ops_scale = ops.scaled_fp8_quant(x, scale=scale, group_shape=group_shape)

    torch.testing.assert_close(scale, ops_scale)
    torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)

    opcheck_fp8_quant(ops_out, x, scale=scale)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("group_shape", [(1, -1), (-1, 1)])  # per-token, per-channel
@torch.inference_mode()
def test_static_fp8_quant_1d_scale(
    num_tokens: int,
    hidden_size: int,
    dtype: torch.dtype,
    seed: int,
    group_shape: tuple[int, int],
) -> None:
    """Test static FP8 quantization with 1D scale (per-token or per-channel).

    The reference scales from ``scaled_quantize`` are flattened before being
    passed to ``ops.scaled_fp8_quant``, so the op is exercised with a 1D
    scale tensor plus an explicit ``group_shape``.
    """
    # Seed through set_random_seed, the helper every other test in this
    # file uses, instead of current_platform.seed_everything.
    set_random_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
    ref_out, scale_2d = scaled_quantize(
        x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
    )

    # Flatten scale to 1D for testing 1D scale path
    scale_1d = scale_2d.flatten()
    ops_out, ops_scale = ops.scaled_fp8_quant(
        x, scale=scale_1d, group_shape=group_shape
    )

    torch.testing.assert_close(scale_1d, ops_scale)
    torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)

    opcheck_fp8_quant(ops_out, x, scale=scale_1d, group_shape=group_shape)
tests/kernels/quantization/untest_nvfp4_quant.py
View file @
7e63ef82
...
...
@@ -6,6 +6,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
seed
:
int
,
device
:
str
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
m
,
n
=
shape
...
...
@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@
torch
.
inference_mode
()
def
test_quantize_to_fp4_padded
(
pad_shape
:
tuple
[
int
,
int
])
->
None
:
dtype
=
torch
.
float16
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
torch
.
set_default_device
(
"cuda:0"
)
m
,
n
=
pad_shape
...
...
tests/kernels/quantization/untest_nvfp4_scaled_mm.py
View file @
7e63ef82
...
...
@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
...
...
@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
seed
:
int
,
device
:
str
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
m
,
n
,
packed_k
=
shape
k
=
packed_k
*
2
block_size
=
16
...
...
tests/kernels/test_apply_repetition_penalties.py
View file @
7e63ef82
...
...
@@ -9,6 +9,7 @@ from vllm._custom_ops import (
apply_repetition_penalties_torch
,
)
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
NUM_SEQS
=
[
1
,
2
,
3
,
4
,
8
,
13
,
17
,
32
,
37
,
256
,
1023
,
1024
,
1025
]
# [stress, stress, stress, Qwen, llama 4]
...
...
@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
Test the apply_repetition_penalties custom op
against a reference implementation.
"""
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
"cuda:0"
)
# Create test data
...
...
@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
dtype
=
torch
.
float32
seed
=
0
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
"cuda:0"
)
# Create test data
...
...
tests/kernels/test_fla_layernorm_guard.py
View file @
7e63ef82
...
...
@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
layernorm_fn
,
rms_norm_ref
,
)
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
def
layer_norm_ref
(
...
...
@@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic(
is_rms_norm
:
bool
,
)
->
None
:
"""Test basic layer norm forward pass without z (gate) tensor."""
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
device
=
torch
.
device
(
"cuda:0"
)
# Create inputs
...
...
@@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate(
is_rms_norm
:
bool
,
)
->
None
:
"""Test layer norm forward pass with z (gate) tensor."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
# Create inputs
...
...
@@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups(
f
"hidden_size
{
hidden_size
}
not divisible by group_size
{
group_size
}
"
)
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
# Create inputs
...
...
@@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block(
dtype
:
torch
.
dtype
,
)
->
None
:
"""Test that rows_per_block logic works correctly for various M sizes."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
hidden_size
=
1024
...
...
@@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block(
def
test_strided_input
(
dtype
:
torch
.
dtype
)
->
None
:
"""Test that the kernel handles non-contiguous (strided)
inputs correctly."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
num_tokens
=
128
hidden_size
=
1024
...
...
@@ -318,7 +318,7 @@ def test_output_buffer_provided(
dtype
:
torch
.
dtype
,
)
->
None
:
"""Test that the kernel works when an output buffer is provided."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
# Create inputs
...
...
@@ -359,7 +359,7 @@ def test_multidimensional_input(
dtype
:
torch
.
dtype
,
)
->
None
:
"""Test that the autograd function handles multidimensional inputs."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
hidden_size
=
shape
[
-
1
]
...
...
tests/kernels/test_flex_attention.py
View file @
7e63ef82
...
...
@@ -42,7 +42,7 @@ def set_seed(seed):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
def
test_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
...
...
@@ -59,35 +59,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
)
as
llm_flex
:
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
)
as
llm_flex
:
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.85
,
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
set_seed
(
seed
)
with
vllm_runner
(
model_name
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.85
,
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
output_flex
,
...
...
@@ -101,7 +98,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
...
...
@@ -115,30 +112,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_flex
:
flex_outputs
=
llm_flex
.
embed
(
prompts
)
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
)
as
llm_flex
:
flex_outputs
=
llm_flex
.
embed
(
prompts
)
# Run with default backend
with
(
monkeypatch
.
context
()
as
m
,
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_default
,
):
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
torch
.
bfloat16
,
tensor_parallel_size
=
1
,
max_model_len
=
100
,
enforce_eager
=
True
,
)
as
llm_default
:
default_outputs
=
llm_default
.
embed
(
prompts
)
check_embeddings_close
(
...
...
tests/kernels/untest_fused_quant_activation.py
View file @
7e63ef82
...
...
@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
def
test_silu_and_mul
(
default_vllm_config
,
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
...
...
tests/kernels/utils.py
View file @
7e63ef82
...
...
@@ -13,11 +13,11 @@ import torch
from
torch._prims_common
import
TensorLikeType
from
tests.kernels.quant_utils
import
native_w8a8_block_matmul
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.model_executor.custom_op
import
CustomOp
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.utils.torch_utils
import
make_tensor_with_pad
from
vllm.v1.attention.backend
import
AttentionType
# For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4.
...
...
tests/lora/conftest.py
View file @
7e63ef82
...
...
@@ -84,7 +84,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@
pytest
.
fixture
def
dummy_model
()
->
nn
.
Module
:
def
dummy_model
(
default_vllm_config
)
->
nn
.
Module
:
model
=
DummyLoRAModel
(
OrderedDict
(
[
...
...
@@ -117,7 +117,7 @@ def dummy_model() -> nn.Module:
@
pytest
.
fixture
def
dummy_model_gate_up
()
->
nn
.
Module
:
def
dummy_model_gate_up
(
default_vllm_config
)
->
nn
.
Module
:
model
=
DummyLoRAModel
(
OrderedDict
(
[
...
...
@@ -214,6 +214,31 @@ def qwen25vl_lora_files():
return
snapshot_download
(
repo_id
=
"jeeejeee/qwen25-vl-lora-pokemon"
)
@pytest.fixture(scope="session")
def qwen2vl_language_lora_files():
    """Session-scoped fixture: snapshot_download result for the Qwen2-VL
    language LoRA repo."""
    adapter_repo = "prashanth058/qwen2vl-flickr-lora-language"
    return snapshot_download(repo_id=adapter_repo)
@pytest.fixture(scope="session")
def qwen2vl_vision_tower_connector_lora_files():
    """Session-scoped fixture: snapshot_download result for the Qwen2-VL
    tower-connector LoRA repo."""
    adapter_repo = "prashanth058/qwen2vl-flickr-lora-tower-connector"
    return snapshot_download(repo_id=adapter_repo)
@pytest.fixture(scope="session")
def qwen2vl_vision_tower_lora_files():
    """Session-scoped fixture: snapshot_download result for the Qwen2-VL
    tower LoRA repo."""
    adapter_repo = "prashanth058/qwen2vl-flickr-lora-tower"
    return snapshot_download(repo_id=adapter_repo)
@pytest.fixture(scope="session")
def qwen25vl_vision_lora_files():
    """Session-scoped fixture: snapshot_download result for the Qwen2.5-VL
    vision-connector LoRA repo."""
    adapter_repo = "EpochEcho/qwen2.5-3b-vl-lora-vision-connector"
    return snapshot_download(repo_id=adapter_repo)
@pytest.fixture(scope="session")
def qwen3vl_vision_lora_files():
    """Session-scoped fixture: snapshot_download result for the Qwen3-VL
    vision-connector LoRA repo."""
    adapter_repo = "EpochEcho/qwen3-4b-vl-lora-vision-connector"
    return snapshot_download(repo_id=adapter_repo)
@
pytest
.
fixture
(
scope
=
"session"
)
def
tinyllama_lora_files
():
# return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
...
...
tests/lora/test_fused_moe_lora_kernel.py
View file @
7e63ef82
...
...
@@ -18,8 +18,8 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_world_size
,
)
from
vllm.lora.ops.triton_ops
import
fused_moe_lora
from
vllm.platforms
import
current_platform
from
vllm.utils.network_utils
import
get_open_port
from
vllm.utils.torch_utils
import
set_random_seed
@
pytest
.
fixture
(
autouse
=
True
)
...
...
@@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel(
seed
,
):
torch
.
set_default_device
(
device
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
# the number of randomly generated sentences.
num_sequences
=
10
# generate data
...
...
@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded(
seed
,
column_parallel
,
):
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
# the number of randomly generated sentences.
num_sequences
=
10
# generate data
...
...
@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
def
_get_shard_slice
(
shard_size
):
return
slice
(
local_rank
*
shard_size
,
(
local_rank
+
1
)
*
shard_size
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
...
...
tests/lora/test_gptoss_tp.py
View file @
7e63ef82
...
...
@@ -34,9 +34,9 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of
###Response:<|end|><|start|>assistant<|channel|>final<|message|>"""
# noqa: E501
EXPECTED_LORA_OUTPUT
=
[
"SELECT
AVG
(Working_Horses) FROM farm WHERE Total_Horses
>
5000
;
"
,
"SELECT
MAX
(Cows)
AS Max_Cows, MIN(Cows) AS M
in
_
Cows FROM farm
;
"
,
"SELECT
MAX
(Cows)
AS Max_Cows, MIN(Cows) AS M
in
_
Cows FROM farm
;
"
,
"SELECT
avg
(Working_Horses) FROM farm WHERE Total_Horses
>
5000"
,
"SELECT
max
(Cows)
, m
in
(
Cows
)
FROM farm"
,
"SELECT
max
(Cows)
, m
in
(
Cows
)
FROM farm"
,
]
...
...
@@ -69,38 +69,54 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
assert
generated_texts
[
i
].
startswith
(
EXPECTED_LORA_OUTPUT
[
i
])
def
test_gpt_oss_lora
(
gptoss20b_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
enable_lora
=
True
,
max_loras
=
4
,
max_lora_rank
=
8
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
1
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
2
)
@
pytest
.
mark
.
parametrize
(
"mxfp4_use_marlin"
,
[
True
,
False
])
def
test_gpt_oss_lora
(
monkeypatch
:
pytest
.
MonkeyPatch
,
gptoss20b_lora_files
,
mxfp4_use_marlin
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_MXFP4_USE_MARLIN"
,
"1"
if
mxfp4_use_marlin
else
"0"
)
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
enable_lora
=
True
,
max_loras
=
4
,
max_lora_rank
=
8
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
2048
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
1
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"fully_sharded_loras"
,
[
False
,
True
])
def
test_gpt_oss_lora_tp2
(
gptoss20b_lora_files
,
fully_sharded_loras
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
enable_lora
=
True
,
max_loras
=
2
,
max_lora_rank
=
8
,
max_num_seqs
=
16
,
tensor_parallel_size
=
2
,
fully_sharded_loras
=
fully_sharded_loras
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
1
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
2
)
@
pytest
.
mark
.
parametrize
(
"mxfp4_use_marlin"
,
[
True
,
False
])
def
test_gpt_oss_lora_tp2
(
monkeypatch
:
pytest
.
MonkeyPatch
,
gptoss20b_lora_files
,
fully_sharded_loras
,
mxfp4_use_marlin
,
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_MXFP4_USE_MARLIN"
,
"1"
if
mxfp4_use_marlin
else
"0"
)
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
enable_lora
=
True
,
max_loras
=
2
,
max_num_seqs
=
2
,
max_num_batched_tokens
=
2048
,
tensor_parallel_size
=
2
,
gpu_memory_utilization
=
0.8
,
fully_sharded_loras
=
fully_sharded_loras
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
1
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
2
)
tests/lora/test_layers.py
View file @
7e63ef82
...
...
@@ -43,8 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding
,
get_masked_input_and_mask
,
)
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
from
.utils
import
DummyLoRAManager
...
...
@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_embeddings
(
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
)
->
None
:
def
test_embeddings
(
default_vllm_config
,
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
)
->
None
:
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# device, see: https://github.com/triton-lang/triton/issues/2925
# Same below.
...
...
@@ -261,11 +263,11 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
torch
.
set_default_device
(
device
)
max_loras
=
8
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
create_random_embedding_layer
():
embedding
=
VocabParallelEmbedding
(
vocab_size
,
256
)
...
...
@@ -353,18 +355,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
256512
])
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_lm_head_logits_processor
(
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
default_vllm_config
,
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
)
->
None
:
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
torch
.
set_default_device
(
device
)
max_loras
=
8
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
_pretest
():
linear
=
ParallelLMHead
(
...
...
@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_linear_replicated
(
default_vllm_config
,
dist_init
,
num_loras
,
device
,
...
...
@@ -480,13 +483,13 @@ def test_linear_replicated(
max_loras
=
8
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
,
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
create_random_linear_replicated_layer
():
linear
=
ReplicatedLinear
(
4096
,
4096
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
...
...
@@ -580,21 +583,21 @@ def test_linear_replicated(
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_linear_parallel
(
dist_init
,
num_loras
,
orientation
,
fully_shard
,
device
,
stage
default_vllm_config
,
dist_init
,
num_loras
,
orientation
,
fully_shard
,
device
,
stage
)
->
None
:
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
max_loras
=
8
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
fully_sharded_loras
=
fully_shard
,
lora_dtype
=
torch
.
float16
,
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
create_random_linear_parallel_layer
():
if
orientation
==
"row"
:
...
...
@@ -705,21 +708,21 @@ def test_linear_parallel(
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_column_parallel_packed
(
dist_init
,
num_loras
,
repeats
,
fully_shard
,
device
,
stage
default_vllm_config
,
dist_init
,
num_loras
,
repeats
,
fully_shard
,
device
,
stage
)
->
None
:
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
max_loras
=
8
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
fully_sharded_loras
=
fully_shard
,
lora_dtype
=
torch
.
float16
,
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
create_column_parallel_packed_layer
():
if
repeats
==
2
:
...
...
@@ -851,7 +854,7 @@ def test_column_parallel_packed(
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
))
)
def
test_vocab_parallel_embedding_indices
(
tp_size
,
seed
):
def
test_vocab_parallel_embedding_indices
(
tp_size
,
seed
,
default_vllm_config
):
random
.
seed
(
seed
)
vocab_size
=
random
.
randint
(
4000
,
64000
)
added_vocab_size
=
random
.
randint
(
0
,
1024
)
...
...
tests/lora/test_llama_tp.py
View file @
7e63ef82
...
...
@@ -77,11 +77,18 @@ def do_sample(
if
lora_id
else
None
,
)
# Print the outputs.
lora_request
=
LoRARequest
(
str
(
lora_id
),
lora_id
,
lora_path
)
if
lora_id
else
None
generated_texts
:
list
[
str
]
=
[]
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
# The output should include correct lora_request info
if
lora_request
is
not
None
:
assert
output
.
lora_request
.
lora_name
==
lora_request
.
lora_name
assert
output
.
lora_request
.
lora_int_id
==
lora_request
.
lora_int_id
assert
output
.
lora_request
.
lora_path
==
lora_request
.
lora_path
else
:
assert
output
.
lora_request
is
None
generated_texts
.
append
(
generated_text
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
return
generated_texts
...
...
tests/lora/test_lora_manager.py
View file @
7e63ef82
...
...
@@ -18,6 +18,7 @@ from vllm.lora.layers import (
from
vllm.lora.lora_model
import
LoRAModel
from
vllm.lora.lora_weights
import
LoRALayerWeights
,
PackedLoRALayerWeights
from
vllm.lora.model_manager
import
(
DEFAULT_LANGUAGE_WRAPPER_KEY
,
LoRAMapping
,
LoRAModelManager
,
LRUCacheLoRAModelManager
,
...
...
@@ -110,7 +111,7 @@ def create_packed_lora(
return
LoRAModel
(
lora_id
,
8
,
loras
)
def
test_replace_submodules
(
dist_init
,
dummy_model
):
def
test_replace_submodules
(
default_vllm_config
,
dist_init
,
dummy_model
):
model
=
dummy_model
manager
=
LoRAModelManager
(
model
,
...
...
@@ -132,7 +133,7 @@ def test_replace_submodules(dist_init, dummy_model):
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lora_model_manager
(
dist_init
,
dummy_model
,
device
):
def
test_lora_model_manager
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
model
=
dummy_model
model_lora1
=
create_lora
(
1
,
model
,
[
"layer1.dense1"
,
"dense2"
,
"lm_head"
],
device
=
device
...
...
@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
assert
manager
.
activate_adapter
(
2
)
assert
manager
.
lora_index_to_id
[
0
]
==
3
assert
manager
.
lora_index_to_id
[
1
]
==
2
assert
manager
.
device
==
device
assert
manager
.
punica_wrapper
.
device
==
device
assert
(
manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
).
device
==
device
)
assert
hasattr
(
manager
,
"supported_lora_modules"
)
assert
sorted
(
manager
.
supported_lora_modules
)
==
[
"dense1"
,
...
...
@@ -196,7 +199,9 @@ def test_lora_model_manager(dist_init, dummy_model, device):
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lora_lru_cache_model_manager
(
dist_init
,
dummy_model
,
device
):
def
test_lora_lru_cache_model_manager
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
model
=
dummy_model
model_lora1
=
create_lora
(
1
,
model
,
[
"layer1.dense1"
,
"dense2"
,
"lm_head"
],
device
=
device
...
...
@@ -278,13 +283,15 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
assert
manager
.
remove_adapter
(
3
)
with
pytest
.
raises
(
ValueError
):
assert
manager
.
pin_adapter
(
3
)
assert
manager
.
punica_wrapper
.
device
==
device
assert
(
manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
).
device
==
device
)
assert
manager
.
device
==
device
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lru_lora_model_manager
(
dist_init
,
dummy_model
,
device
):
def
test_lru_lora_model_manager
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
# This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager
model
=
dummy_model
...
...
@@ -402,12 +409,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert
manager
.
remove_oldest_adapter
()
assert
set
(
manager
.
list_adapters
())
==
{
1
}
assert
manager
.
punica_wrapper
.
device
==
device
assert
(
manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
).
device
==
device
)
assert
manager
.
device
==
device
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lru_cache_worker_adapter_manager
(
dist_init
,
dummy_model
,
device
,
tmp_path
):
def
test_lru_cache_worker_adapter_manager
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
,
tmp_path
):
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
)
...
...
@@ -514,11 +526,16 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
)
assert
worker_adapter_manager
.
device
==
device
assert
worker_adapter_manager
.
_adapter_manager
.
punica_wrapper
.
device
==
device
punica_wrapper
=
worker_adapter_manager
.
_adapter_manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert
punica_wrapper
.
device
==
device
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_worker_adapter_manager
(
dist_init
,
dummy_model_gate_up
,
device
,
tmp_path
):
def
test_worker_adapter_manager
(
default_vllm_config
,
dist_init
,
dummy_model_gate_up
,
device
,
tmp_path
):
# Should remove every LoRA not specified in the request.
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
...
...
@@ -618,11 +635,14 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
)
assert
worker_adapter_manager
.
device
==
device
assert
worker_adapter_manager
.
_adapter_manager
.
punica_wrapper
.
device
==
device
punica_wrapper
=
worker_adapter_manager
.
_adapter_manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert
punica_wrapper
.
device
==
device
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_packed_loras
(
dist_init
,
dummy_model_gate_up
,
device
):
def
test_packed_loras
(
default_vllm_config
,
dist_init
,
dummy_model_gate_up
,
device
):
model
=
dummy_model_gate_up
model_lora
=
create_packed_lora
(
1
,
...
...
Prev
1
…
21
22
23
24
25
26
27
28
29
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment