Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
377 additions
and
151 deletions
+377
-151
tests/kernels/quantization/test_int8_quant.py
tests/kernels/quantization/test_int8_quant.py
+5
-5
tests/kernels/quantization/test_mxfp4_qutlass.py
tests/kernels/quantization/test_mxfp4_qutlass.py
+2
-1
tests/kernels/quantization/test_nvfp4_qutlass.py
tests/kernels/quantization/test_nvfp4_qutlass.py
+2
-1
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+3
-1
tests/kernels/quantization/test_triton_scaled_mm.py
tests/kernels/quantization/test_triton_scaled_mm.py
+3
-1
tests/kernels/quantization/untest_block_fp8.py
tests/kernels/quantization/untest_block_fp8.py
+51
-0
tests/kernels/quantization/untest_fp8_quant.py
tests/kernels/quantization/untest_fp8_quant.py
+107
-6
tests/kernels/quantization/untest_nvfp4_quant.py
tests/kernels/quantization/untest_nvfp4_quant.py
+3
-2
tests/kernels/quantization/untest_nvfp4_scaled_mm.py
tests/kernels/quantization/untest_nvfp4_scaled_mm.py
+2
-1
tests/kernels/test_apply_repetition_penalties.py
tests/kernels/test_apply_repetition_penalties.py
+3
-2
tests/kernels/test_fla_layernorm_guard.py
tests/kernels/test_fla_layernorm_guard.py
+8
-8
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+44
-51
tests/kernels/untest_fused_quant_activation.py
tests/kernels/untest_fused_quant_activation.py
+1
-0
tests/kernels/utils.py
tests/kernels/utils.py
+1
-1
tests/lora/conftest.py
tests/lora/conftest.py
+27
-2
tests/lora/test_fused_moe_lora_kernel.py
tests/lora/test_fused_moe_lora_kernel.py
+4
-4
tests/lora/test_gptoss_tp.py
tests/lora/test_gptoss_tp.py
+50
-34
tests/lora/test_layers.py
tests/lora/test_layers.py
+19
-16
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+8
-1
tests/lora/test_lora_manager.py
tests/lora/test_lora_manager.py
+34
-14
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/kernels/quantization/test_int8_quant.py
View file @
7e63ef82
...
@@ -7,7 +7,7 @@ import torch
...
@@ -7,7 +7,7 @@ import torch
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
tests.kernels.utils
import
opcheck
from
tests.kernels.utils
import
opcheck
from
vllm._custom_ops
import
scaled_int8_quant
from
vllm._custom_ops
import
scaled_int8_quant
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
17
,
1024
,
1025
,
1026
,
5137
,
8193
]
HIDDEN_SIZES
=
[
17
,
1024
,
1025
,
1026
,
5137
,
8193
]
...
@@ -48,7 +48,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
...
@@ -48,7 +48,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
def
test_dynamic_scaled_int8_quant
(
def
test_dynamic_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
@@ -74,7 +74,7 @@ def test_dynamic_scaled_int8_quant(
...
@@ -74,7 +74,7 @@ def test_dynamic_scaled_int8_quant(
def
test_dynamic_scaled_int8_azp_quant
(
def
test_dynamic_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
-
300
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
-
300
...
@@ -115,7 +115,7 @@ def test_dynamic_scaled_int8_azp_quant(
...
@@ -115,7 +115,7 @@ def test_dynamic_scaled_int8_azp_quant(
def
test_static_scaled_int8_quant
(
def
test_static_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
@@ -148,7 +148,7 @@ def test_static_scaled_int8_azp_quant(
...
@@ -148,7 +148,7 @@ def test_static_scaled_int8_azp_quant(
scale
:
float
,
scale
:
float
,
azp
:
int
,
azp
:
int
,
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
-
300
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
-
300
...
...
tests/kernels/quantization/test_mxfp4_qutlass.py
View file @
7e63ef82
...
@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
...
@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
from
vllm._custom_ops
import
fusedQuantizeMx
,
matmul_mxf4_bf16_tn
from
vllm._custom_ops
import
fusedQuantizeMx
,
matmul_mxf4_bf16_tn
from
vllm.model_executor.layers.quantization.qutlass_utils
import
to_blocked
from
vllm.model_executor.layers.quantization.qutlass_utils
import
to_blocked
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
torch
.
cuda
.
is_available
():
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
"CUDA required for these tests."
,
allow_module_level
=
True
)
pytest
.
skip
(
"CUDA required for these tests."
,
allow_module_level
=
True
)
...
@@ -205,7 +206,7 @@ LLAMA_MODELS = {
...
@@ -205,7 +206,7 @@ LLAMA_MODELS = {
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
def
_seed_each_test
():
def
_seed_each_test
():
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
np
.
random
.
seed
(
0
)
np
.
random
.
seed
(
0
)
torch
.
random
.
manual_seed
(
0
)
torch
.
random
.
manual_seed
(
0
)
...
...
tests/kernels/quantization/test_nvfp4_qutlass.py
View file @
7e63ef82
...
@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
...
@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from
vllm._custom_ops
import
fusedQuantizeNv
from
vllm._custom_ops
import
fusedQuantizeNv
from
vllm.model_executor.layers.quantization.qutlass_utils
import
to_blocked
from
vllm.model_executor.layers.quantization.qutlass_utils
import
to_blocked
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
torch
.
cuda
.
is_available
():
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
"CUDA required for these tests."
,
allow_module_level
=
True
)
pytest
.
skip
(
"CUDA required for these tests."
,
allow_module_level
=
True
)
...
@@ -193,7 +194,7 @@ LLAMA_MODELS = {
...
@@ -193,7 +194,7 @@ LLAMA_MODELS = {
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
def
_seed_each_test
():
def
_seed_each_test
():
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
np
.
random
.
seed
(
0
)
np
.
random
.
seed
(
0
)
torch
.
random
.
manual_seed
(
0
)
torch
.
random
.
manual_seed
(
0
)
...
...
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
View file @
7e63ef82
...
@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
...
@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from
vllm._custom_ops
import
scaled_fp4_quant
from
vllm._custom_ops
import
scaled_fp4_quant
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
pytest
.
skip
(
...
@@ -30,10 +31,11 @@ BLOCK_SIZE = 16
...
@@ -30,10 +31,11 @@ BLOCK_SIZE = 16
@
pytest
.
mark
.
parametrize
(
"shape"
,
SHAPES
)
@
pytest
.
mark
.
parametrize
(
"shape"
,
SHAPES
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_silu_mul_nvfp4_quant
(
def
test_silu_mul_nvfp4_quant
(
default_vllm_config
,
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
shape
:
tuple
[
int
,
int
],
shape
:
tuple
[
int
,
int
],
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
"cuda:0"
device
=
"cuda:0"
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
...
...
tests/kernels/quantization/test_triton_scaled_mm.py
View file @
7e63ef82
...
@@ -11,7 +11,9 @@ import pytest
...
@@ -11,7 +11,9 @@ import pytest
import
torch
import
torch
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
from
vllm.utils.torch_utils
import
set_random_seed
device
=
"cuda"
device
=
"cuda"
...
@@ -86,7 +88,7 @@ def test_scaled_mm(
...
@@ -86,7 +88,7 @@ def test_scaled_mm(
):
):
is_floating_point_type
=
lambda
t
:
torch
.
tensor
([
1
,
1
],
dtype
=
t
).
is_floating_point
()
is_floating_point_type
=
lambda
t
:
torch
.
tensor
([
1
,
1
],
dtype
=
t
).
is_floating_point
()
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
# NOTE: There are cases, where if the matrix is large enough, an output
# NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when
# like 65504.4 can be produced, and can easily turn into inf when
...
...
tests/kernels/quantization/untest_block_fp8.py
View file @
7e63ef82
...
@@ -24,6 +24,10 @@ from vllm.utils.deep_gemm import (
...
@@ -24,6 +24,10 @@ from vllm.utils.deep_gemm import (
per_block_cast_to_fp8
,
per_block_cast_to_fp8
,
should_use_deepgemm_for_fp8_linear
,
should_use_deepgemm_for_fp8_linear
,
)
)
from
vllm.utils.flashinfer
import
(
flashinfer_fp8_blockscale_gemm
,
has_flashinfer_fp8_blockscale_gemm
,
)
from
vllm.utils.import_utils
import
has_deep_gemm
from
vllm.utils.import_utils
import
has_deep_gemm
if
current_platform
.
get_device_capability
()
<
(
9
,
0
):
if
current_platform
.
get_device_capability
()
<
(
9
,
0
):
...
@@ -205,3 +209,50 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
...
@@ -205,3 +209,50 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
torch
.
abs
(
out
.
to
(
torch
.
float32
)
-
ref_out
.
to
(
torch
.
float32
))
torch
.
abs
(
out
.
to
(
torch
.
float32
)
-
ref_out
.
to
(
torch
.
float32
))
)
/
torch
.
mean
(
torch
.
abs
(
ref_out
.
to
(
torch
.
float32
)))
)
/
torch
.
mean
(
torch
.
abs
(
ref_out
.
to
(
torch
.
float32
)))
assert
rel_diff
<
0.001
assert
rel_diff
<
0.001
@
pytest
.
mark
.
skipif
(
current_platform
.
is_fp8_fnuz
(),
reason
=
"This platform supports e4m3fnuz, not e4m3fn."
,
)
@
pytest
.
mark
.
parametrize
(
"M,N,K,block_size,out_dtype,seed"
,
itertools
.
product
(
M
,
N
,
K
,
BLOCK_SIZE
,
OUT_DTYPES
,
SEEDS
),
)
@
torch
.
inference_mode
()
def
test_w8a8_block_fp8_flashinfer_matmul
(
M
,
N
,
K
,
block_size
,
out_dtype
,
seed
):
if
not
has_flashinfer_fp8_blockscale_gemm
():
pytest
.
skip
(
"FlashInfer block GEMM not available (requires SM90+ and FlashInfer)"
)
# only aligned sizes
if
K
%
128
!=
0
or
N
%
64
!=
0
:
pytest
.
skip
(
f
"Skipping test; invalid size
{
M
}
,
{
N
}
,
{
K
}
"
)
torch
.
manual_seed
(
seed
)
fp8_info
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
fp8_max
=
fp8_info
.
max
A_bf16
=
(
torch
.
rand
(
M
,
K
,
dtype
=
torch
.
bfloat16
)
-
0.5
)
*
2
*
fp8_max
B_bf16
=
(
torch
.
rand
(
N
,
K
,
dtype
=
torch
.
bfloat16
)
-
0.5
)
*
2
*
fp8_max
A_fp8
,
As_fp8
=
per_token_group_quant_fp8
(
A_bf16
,
block_size
[
1
],
use_ue8m0
=
False
)
B_fp8
,
Bs_fp8
=
per_block_cast_to_fp8
(
B_bf16
,
block_size
,
use_ue8m0
=
False
)
As
=
As_fp8
.
to
(
torch
.
float32
)
Bs
=
Bs_fp8
.
to
(
torch
.
float32
)
ref_out
=
native_w8a8_block_matmul
(
A_fp8
,
B_fp8
,
As
,
Bs
,
block_size
,
out_dtype
)
out
=
flashinfer_fp8_blockscale_gemm
(
input
=
A_bf16
,
weight
=
B_fp8
,
input_scale
=
None
,
weight_scale
=
Bs
,
out_dtype
=
out_dtype
,
)
rel_diff
=
torch
.
mean
(
torch
.
abs
(
out
.
to
(
torch
.
bfloat16
)
-
ref_out
.
to
(
torch
.
bfloat16
))
)
/
torch
.
mean
(
torch
.
abs
(
ref_out
.
to
(
torch
.
bfloat16
)))
assert
rel_diff
<
0.001
tests/kernels/quantization/untest_fp8_quant.py
View file @
7e63ef82
...
@@ -11,7 +11,11 @@ from tests.kernels.quant_utils import (
...
@@ -11,7 +11,11 @@ from tests.kernels.quant_utils import (
ref_dynamic_per_token_quant
,
ref_dynamic_per_token_quant
,
)
)
from
tests.kernels.utils
import
opcheck
from
tests.kernels.utils
import
opcheck
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
scaled_quantize
,
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
17
,
1024
,
1025
,
1026
,
5137
,
8193
]
HIDDEN_SIZES
=
[
17
,
1024
,
1025
,
1026
,
5137
,
8193
]
...
@@ -21,10 +25,18 @@ SEEDS = [0]
...
@@ -21,10 +25,18 @@ SEEDS = [0]
def
opcheck_fp8_quant
(
def
opcheck_fp8_quant
(
output
,
input
,
scale
=
None
,
scale_ub
=
None
,
use_per_token_if_dynamic
=
False
output
,
input
,
scale
=
None
,
scale_ub
=
None
,
use_per_token_if_dynamic
=
False
,
group_shape
=
None
,
):
):
if
scale
is
not
None
:
if
scale
is
not
None
:
opcheck
(
torch
.
ops
.
_C
.
static_scaled_fp8_quant
,
(
output
,
input
,
scale
))
opcheck
(
torch
.
ops
.
_C
.
static_scaled_fp8_quant
,
(
output
,
input
,
scale
,
group_shape
),
)
elif
use_per_token_if_dynamic
:
elif
use_per_token_if_dynamic
:
scale
=
torch
.
empty
(
scale
=
torch
.
empty
(
(
input
.
shape
[
0
],
1
),
device
=
input
.
device
,
dtype
=
torch
.
float32
(
input
.
shape
[
0
],
1
),
device
=
input
.
device
,
dtype
=
torch
.
float32
...
@@ -51,7 +63,7 @@ def opcheck_fp8_quant(
...
@@ -51,7 +63,7 @@ def opcheck_fp8_quant(
def
test_dynamic_per_token_fp8_quant
(
def
test_dynamic_per_token_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
scale_ub
:
bool
,
seed
:
int
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
scale_ub
:
bool
,
seed
:
int
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
(
x
=
(
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
+
1e-6
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
+
1e-6
...
@@ -81,7 +93,7 @@ def test_dynamic_per_token_fp8_quant(
...
@@ -81,7 +93,7 @@ def test_dynamic_per_token_fp8_quant(
def
test_dynamic_per_tensor_fp8_quant
(
def
test_dynamic_per_tensor_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
...
@@ -101,7 +113,7 @@ def test_dynamic_per_tensor_fp8_quant(
...
@@ -101,7 +113,7 @@ def test_dynamic_per_tensor_fp8_quant(
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
def
test_fp8_quant_large
(
seed
:
int
)
->
None
:
def
test_fp8_quant_large
(
seed
:
int
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
num_tokens
=
1024000
# Mistral-Nemo's max_position_embeddings
num_tokens
=
1024000
# Mistral-Nemo's max_position_embeddings
hidden_size
=
1152
# Smallest hidden_size to reproduce the error
hidden_size
=
1152
# Smallest hidden_size to reproduce the error
...
@@ -117,4 +129,93 @@ def test_fp8_quant_large(seed: int) -> None:
...
@@ -117,4 +129,93 @@ def test_fp8_quant_large(seed: int) -> None:
ref_out
=
ref_out
.
to
(
dtype
=
dtype
)
ref_out
=
ref_out
.
to
(
dtype
=
dtype
)
ops_out
=
ops_out
.
to
(
dtype
=
dtype
)
ops_out
=
ops_out
.
to
(
dtype
=
dtype
)
torch
.
testing
.
assert_close
(
ref_out
,
ops_out
)
torch
.
testing
.
assert_close
(
ref_out
,
ops_out
)
\ No newline at end of file
# Test static FP8 quantization with 2D group scales
GROUP_SHAPES_2D
=
[
(
-
1
,
-
1
),
# Per-tensor
(
-
1
,
1
),
# Per-channel
(
1
,
-
1
),
# Per-token
(
-
1
,
128
),
# Per-head quantization
(
1
,
128
),
# DeepSeek-style per-token-per-group (group_m=1, group_n=128)
(
128
,
128
),
# DeepSeek-style block quantization
(
1
,
64
),
# Smaller group size
(
1
,
16
),
# Small group (scalar path in kernel)
(
4
,
256
),
# Non-trivial both dimensions
]
# Use sizes divisible by all group shapes
NUM_TOKENS_GROUP
=
[
128
,
512
]
HIDDEN_SIZES_GROUP
=
[
256
,
1024
,
2048
]
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS_GROUP
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES_GROUP
)
@
pytest
.
mark
.
parametrize
(
"group_shape"
,
GROUP_SHAPES_2D
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
torch
.
inference_mode
()
def
test_static_fp8_quant_group_2d
(
num_tokens
:
int
,
hidden_size
:
int
,
group_shape
:
tuple
[
int
,
int
],
dtype
:
torch
.
dtype
,
seed
:
int
,
)
->
None
:
"""Test static FP8 quantization with 2D group scales using scaled_quantize."""
# Normalize group_shape (-1 means full extent)
norm_group_m
=
num_tokens
if
group_shape
[
0
]
==
-
1
else
group_shape
[
0
]
norm_group_n
=
hidden_size
if
group_shape
[
1
]
==
-
1
else
group_shape
[
1
]
# Skip if sizes are not divisible by group shape
if
num_tokens
%
norm_group_m
!=
0
or
hidden_size
%
norm_group_n
!=
0
:
pytest
.
skip
(
f
"Skipping: (
{
num_tokens
}
,
{
hidden_size
}
) not divisible by "
f
"group_shape (
{
group_shape
[
0
]
}
,
{
group_shape
[
1
]
}
)"
)
current_platform
.
seed_everything
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
ref_out
,
scale
=
scaled_quantize
(
x
,
group_shape
,
FP8_DTYPE
,
compute_dtype
=
torch
.
float32
)
ops_out
,
ops_scale
=
ops
.
scaled_fp8_quant
(
x
,
scale
=
scale
,
group_shape
=
group_shape
)
torch
.
testing
.
assert_close
(
scale
,
ops_scale
)
torch
.
testing
.
assert_close
(
ref_out
.
float
(),
ops_out
.
float
(),
rtol
=
0.12
,
atol
=
0.0
)
opcheck_fp8_quant
(
ops_out
,
x
,
scale
=
scale
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS_GROUP
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES_GROUP
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"group_shape"
,
[(
1
,
-
1
),
(
-
1
,
1
)])
# per-token, per-channel
@
torch
.
inference_mode
()
def
test_static_fp8_quant_1d_scale
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
group_shape
:
tuple
[
int
,
int
],
)
->
None
:
"""Test static FP8 quantization with 1D scale (per-token or per-channel)."""
current_platform
.
seed_everything
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
ref_out
,
scale_2d
=
scaled_quantize
(
x
,
group_shape
,
FP8_DTYPE
,
compute_dtype
=
torch
.
float32
)
# Flatten scale to 1D for testing 1D scale path
scale_1d
=
scale_2d
.
flatten
()
ops_out
,
ops_scale
=
ops
.
scaled_fp8_quant
(
x
,
scale
=
scale_1d
,
group_shape
=
group_shape
)
torch
.
testing
.
assert_close
(
scale_1d
,
ops_scale
)
torch
.
testing
.
assert_close
(
ref_out
.
float
(),
ops_out
.
float
(),
rtol
=
0.12
,
atol
=
0.0
)
opcheck_fp8_quant
(
ops_out
,
x
,
scale
=
scale_1d
,
group_shape
=
group_shape
)
tests/kernels/quantization/untest_nvfp4_quant.py
View file @
7e63ef82
...
@@ -6,6 +6,7 @@ import torch
...
@@ -6,6 +6,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.scalar_type
import
scalar_types
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
pytest
.
skip
(
...
@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
...
@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
m
,
n
=
shape
m
,
n
=
shape
...
@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
...
@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_quantize_to_fp4_padded
(
pad_shape
:
tuple
[
int
,
int
])
->
None
:
def
test_quantize_to_fp4_padded
(
pad_shape
:
tuple
[
int
,
int
])
->
None
:
dtype
=
torch
.
float16
dtype
=
torch
.
float16
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
torch
.
set_default_device
(
"cuda:0"
)
torch
.
set_default_device
(
"cuda:0"
)
m
,
n
=
pad_shape
m
,
n
=
pad_shape
...
...
tests/kernels/quantization/untest_nvfp4_scaled_mm.py
View file @
7e63ef82
...
@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
...
@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
has_device_capability
(
100
):
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
pytest
.
skip
(
...
@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
...
@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
m
,
n
,
packed_k
=
shape
m
,
n
,
packed_k
=
shape
k
=
packed_k
*
2
k
=
packed_k
*
2
block_size
=
16
block_size
=
16
...
...
tests/kernels/test_apply_repetition_penalties.py
View file @
7e63ef82
...
@@ -9,6 +9,7 @@ from vllm._custom_ops import (
...
@@ -9,6 +9,7 @@ from vllm._custom_ops import (
apply_repetition_penalties_torch
,
apply_repetition_penalties_torch
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
NUM_SEQS
=
[
1
,
2
,
3
,
4
,
8
,
13
,
17
,
32
,
37
,
256
,
1023
,
1024
,
1025
]
NUM_SEQS
=
[
1
,
2
,
3
,
4
,
8
,
13
,
17
,
32
,
37
,
256
,
1023
,
1024
,
1025
]
# [stress, stress, stress, Qwen, llama 4]
# [stress, stress, stress, Qwen, llama 4]
...
@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
...
@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
Test the apply_repetition_penalties custom op
Test the apply_repetition_penalties custom op
against a reference implementation.
against a reference implementation.
"""
"""
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
"cuda:0"
)
torch
.
set_default_device
(
"cuda:0"
)
# Create test data
# Create test data
...
@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
...
@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
dtype
=
torch
.
float32
dtype
=
torch
.
float32
seed
=
0
seed
=
0
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
"cuda:0"
)
torch
.
set_default_device
(
"cuda:0"
)
# Create test data
# Create test data
...
...
tests/kernels/test_fla_layernorm_guard.py
View file @
7e63ef82
...
@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
...
@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
layernorm_fn
,
layernorm_fn
,
rms_norm_ref
,
rms_norm_ref
,
)
)
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
def
layer_norm_ref
(
def
layer_norm_ref
(
...
@@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic(
...
@@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic(
is_rms_norm
:
bool
,
is_rms_norm
:
bool
,
)
->
None
:
)
->
None
:
"""Test basic layer norm forward pass without z (gate) tensor."""
"""Test basic layer norm forward pass without z (gate) tensor."""
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
# Create inputs
# Create inputs
...
@@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate(
...
@@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate(
is_rms_norm
:
bool
,
is_rms_norm
:
bool
,
)
->
None
:
)
->
None
:
"""Test layer norm forward pass with z (gate) tensor."""
"""Test layer norm forward pass with z (gate) tensor."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
# Create inputs
# Create inputs
...
@@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups(
...
@@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups(
f
"hidden_size
{
hidden_size
}
not divisible by group_size
{
group_size
}
"
f
"hidden_size
{
hidden_size
}
not divisible by group_size
{
group_size
}
"
)
)
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
# Create inputs
# Create inputs
...
@@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block(
...
@@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block(
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
)
->
None
:
)
->
None
:
"""Test that rows_per_block logic works correctly for various M sizes."""
"""Test that rows_per_block logic works correctly for various M sizes."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
hidden_size
=
1024
hidden_size
=
1024
...
@@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block(
...
@@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block(
def
test_strided_input
(
dtype
:
torch
.
dtype
)
->
None
:
def
test_strided_input
(
dtype
:
torch
.
dtype
)
->
None
:
"""Test that the kernel handles non-contiguous (strided)
"""Test that the kernel handles non-contiguous (strided)
inputs correctly."""
inputs correctly."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
num_tokens
=
128
num_tokens
=
128
hidden_size
=
1024
hidden_size
=
1024
...
@@ -318,7 +318,7 @@ def test_output_buffer_provided(
...
@@ -318,7 +318,7 @@ def test_output_buffer_provided(
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
)
->
None
:
)
->
None
:
"""Test that the kernel works when an output buffer is provided."""
"""Test that the kernel works when an output buffer is provided."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
# Create inputs
# Create inputs
...
@@ -359,7 +359,7 @@ def test_multidimensional_input(
...
@@ -359,7 +359,7 @@ def test_multidimensional_input(
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
)
->
None
:
)
->
None
:
"""Test that the autograd function handles multidimensional inputs."""
"""Test that the autograd function handles multidimensional inputs."""
current_platform
.
seed_everything
(
42
)
set_random_seed
(
42
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
hidden_size
=
shape
[
-
1
]
hidden_size
=
shape
[
-
1
]
...
...
tests/kernels/test_flex_attention.py
View file @
7e63ef82
...
@@ -42,7 +42,7 @@ def set_seed(seed):
...
@@ -42,7 +42,7 @@ def set_seed(seed):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
)
def
test_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
This test compares the outputs from the FlexAttention backend with
...
@@ -59,35 +59,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -59,35 +59,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
]
# Run with flex attention
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
set_seed
(
seed
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
with
vllm_runner
(
model_name
,
set_seed
(
seed
)
runner
=
"generate"
,
with
vllm_runner
(
tensor_parallel_size
=
1
,
model_name
,
num_gpu_blocks_override
=
128
,
runner
=
"generate"
,
enforce_eager
=
True
,
tensor_parallel_size
=
1
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
num_gpu_blocks_override
=
128
,
)
as
llm_flex
:
enforce_eager
=
True
,
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
)
as
llm_flex
:
prompts
,
max_tokens
,
num_logprobs
output_flex
=
llm_flex
.
generate_greedy_logprobs
(
)
prompts
,
max_tokens
,
num_logprobs
)
# Run with default backend
# Run with default backend
with
monkeypatch
.
context
()
as
m
:
set_seed
(
seed
)
set_seed
(
seed
)
with
vllm_runner
(
with
vllm_runner
(
model_name
,
model_name
,
runner
=
"generate"
,
runner
=
"generate"
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
num_gpu_blocks_override
=
128
,
num_gpu_blocks_override
=
128
,
enforce_eager
=
True
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.85
,
gpu_memory_utilization
=
0.85
,
)
as
llm_default
:
)
as
llm_default
:
output_default
=
llm_default
.
generate_greedy_logprobs
(
output_default
=
llm_default
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
prompts
,
max_tokens
,
num_logprobs
)
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
output_flex
,
outputs_0_lst
=
output_flex
,
...
@@ -101,7 +98,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -101,7 +98,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
not
torch
.
cuda
.
is_available
()
or
TORCH_VERSION
<
MINIMUM_TORCH_VERSION
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
reason
=
"CUDA not available or PyTorch version < 2.7"
,
)
)
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
,
monkeypatch
):
def
test_encoder_flex_attention_vs_default_backend
(
vllm_runner
):
"""Test that FlexAttention produces the same outputs as the default backend.
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
This test compares the outputs from the FlexAttention backend with
...
@@ -115,30 +112,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
...
@@ -115,30 +112,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
]
# Run with flex attention
# Run with flex attention
with
monkeypatch
.
context
()
as
m
:
with
vllm_runner
(
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
model_name
,
with
vllm_runner
(
runner
=
"pooling"
,
model_name
,
dtype
=
torch
.
bfloat16
,
runner
=
"pooling"
,
tensor_parallel_size
=
1
,
dtype
=
torch
.
bfloat16
,
max_model_len
=
100
,
tensor_parallel_size
=
1
,
enforce_eager
=
True
,
max_model_len
=
100
,
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
},
enforce_eager
=
True
,
)
as
llm_flex
:
)
as
llm_flex
:
flex_outputs
=
llm_flex
.
embed
(
prompts
)
flex_outputs
=
llm_flex
.
embed
(
prompts
)
# Run with default backend
# Run with default backend
with
(
with
vllm_runner
(
monkeypatch
.
context
()
as
m
,
model_name
,
vllm_runner
(
runner
=
"pooling"
,
model_name
,
dtype
=
torch
.
bfloat16
,
runner
=
"pooling"
,
tensor_parallel_size
=
1
,
dtype
=
torch
.
bfloat16
,
max_model_len
=
100
,
tensor_parallel_size
=
1
,
enforce_eager
=
True
,
max_model_len
=
100
,
)
as
llm_default
:
enforce_eager
=
True
,
)
as
llm_default
,
):
default_outputs
=
llm_default
.
embed
(
prompts
)
default_outputs
=
llm_default
.
embed
(
prompts
)
check_embeddings_close
(
check_embeddings_close
(
...
...
tests/kernels/untest_fused_quant_activation.py
View file @
7e63ef82
...
@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
...
@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_silu_and_mul
(
def
test_silu_and_mul
(
default_vllm_config
,
num_tokens
:
int
,
num_tokens
:
int
,
hidden_size
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
...
...
tests/kernels/utils.py
View file @
7e63ef82
...
@@ -13,11 +13,11 @@ import torch
...
@@ -13,11 +13,11 @@ import torch
from
torch._prims_common
import
TensorLikeType
from
torch._prims_common
import
TensorLikeType
from
tests.kernels.quant_utils
import
native_w8a8_block_matmul
from
tests.kernels.quant_utils
import
native_w8a8_block_matmul
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.model_executor.custom_op
import
CustomOp
from
vllm.model_executor.custom_op
import
CustomOp
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.utils.torch_utils
import
make_tensor_with_pad
from
vllm.utils.torch_utils
import
make_tensor_with_pad
from
vllm.v1.attention.backend
import
AttentionType
# For now, disable "test_aot_dispatch_dynamic" since there are some
# For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4.
# bugs related to this test in PyTorch 2.4.
...
...
tests/lora/conftest.py
View file @
7e63ef82
...
@@ -84,7 +84,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
...
@@ -84,7 +84,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@
pytest
.
fixture
@
pytest
.
fixture
def
dummy_model
()
->
nn
.
Module
:
def
dummy_model
(
default_vllm_config
)
->
nn
.
Module
:
model
=
DummyLoRAModel
(
model
=
DummyLoRAModel
(
OrderedDict
(
OrderedDict
(
[
[
...
@@ -117,7 +117,7 @@ def dummy_model() -> nn.Module:
...
@@ -117,7 +117,7 @@ def dummy_model() -> nn.Module:
@
pytest
.
fixture
@
pytest
.
fixture
def
dummy_model_gate_up
()
->
nn
.
Module
:
def
dummy_model_gate_up
(
default_vllm_config
)
->
nn
.
Module
:
model
=
DummyLoRAModel
(
model
=
DummyLoRAModel
(
OrderedDict
(
OrderedDict
(
[
[
...
@@ -214,6 +214,31 @@ def qwen25vl_lora_files():
...
@@ -214,6 +214,31 @@ def qwen25vl_lora_files():
return
snapshot_download
(
repo_id
=
"jeeejeee/qwen25-vl-lora-pokemon"
)
return
snapshot_download
(
repo_id
=
"jeeejeee/qwen25-vl-lora-pokemon"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
qwen2vl_language_lora_files
():
return
snapshot_download
(
repo_id
=
"prashanth058/qwen2vl-flickr-lora-language"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
qwen2vl_vision_tower_connector_lora_files
():
return
snapshot_download
(
repo_id
=
"prashanth058/qwen2vl-flickr-lora-tower-connector"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
qwen2vl_vision_tower_lora_files
():
return
snapshot_download
(
repo_id
=
"prashanth058/qwen2vl-flickr-lora-tower"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
qwen25vl_vision_lora_files
():
return
snapshot_download
(
repo_id
=
"EpochEcho/qwen2.5-3b-vl-lora-vision-connector"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
qwen3vl_vision_lora_files
():
return
snapshot_download
(
repo_id
=
"EpochEcho/qwen3-4b-vl-lora-vision-connector"
)
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
tinyllama_lora_files
():
def
tinyllama_lora_files
():
# return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
# return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
...
...
tests/lora/test_fused_moe_lora_kernel.py
View file @
7e63ef82
...
@@ -18,8 +18,8 @@ from vllm.distributed.parallel_state import (
...
@@ -18,8 +18,8 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_world_size
,
)
)
from
vllm.lora.ops.triton_ops
import
fused_moe_lora
from
vllm.lora.ops.triton_ops
import
fused_moe_lora
from
vllm.platforms
import
current_platform
from
vllm.utils.network_utils
import
get_open_port
from
vllm.utils.network_utils
import
get_open_port
from
vllm.utils.torch_utils
import
set_random_seed
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
...
@@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel(
...
@@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel(
seed
,
seed
,
):
):
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
# the number of randomly generated sentences.
# the number of randomly generated sentences.
num_sequences
=
10
num_sequences
=
10
# generate data
# generate data
...
@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded(
...
@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded(
seed
,
seed
,
column_parallel
,
column_parallel
,
):
):
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
# the number of randomly generated sentences.
# the number of randomly generated sentences.
num_sequences
=
10
num_sequences
=
10
# generate data
# generate data
...
@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
...
@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
def
_get_shard_slice
(
shard_size
):
def
_get_shard_slice
(
shard_size
):
return
slice
(
local_rank
*
shard_size
,
(
local_rank
+
1
)
*
shard_size
)
return
slice
(
local_rank
*
shard_size
,
(
local_rank
+
1
)
*
shard_size
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
...
...
tests/lora/test_gptoss_tp.py
View file @
7e63ef82
...
@@ -34,9 +34,9 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of
...
@@ -34,9 +34,9 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of
###Response:<|end|><|start|>assistant<|channel|>final<|message|>"""
# noqa: E501
###Response:<|end|><|start|>assistant<|channel|>final<|message|>"""
# noqa: E501
EXPECTED_LORA_OUTPUT
=
[
EXPECTED_LORA_OUTPUT
=
[
"SELECT
AVG
(Working_Horses) FROM farm WHERE Total_Horses
>
5000
;
"
,
"SELECT
avg
(Working_Horses) FROM farm WHERE Total_Horses
>
5000"
,
"SELECT
MAX
(Cows)
AS Max_Cows, MIN(Cows) AS M
in
_
Cows FROM farm
;
"
,
"SELECT
max
(Cows)
, m
in
(
Cows
)
FROM farm"
,
"SELECT
MAX
(Cows)
AS Max_Cows, MIN(Cows) AS M
in
_
Cows FROM farm
;
"
,
"SELECT
max
(Cows)
, m
in
(
Cows
)
FROM farm"
,
]
]
...
@@ -69,38 +69,54 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
...
@@ -69,38 +69,54 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
assert
generated_texts
[
i
].
startswith
(
EXPECTED_LORA_OUTPUT
[
i
])
assert
generated_texts
[
i
].
startswith
(
EXPECTED_LORA_OUTPUT
[
i
])
def
test_gpt_oss_lora
(
gptoss20b_lora_files
):
@
pytest
.
mark
.
parametrize
(
"mxfp4_use_marlin"
,
[
True
,
False
])
llm
=
vllm
.
LLM
(
def
test_gpt_oss_lora
(
MODEL_PATH
,
monkeypatch
:
pytest
.
MonkeyPatch
,
gptoss20b_lora_files
,
mxfp4_use_marlin
max_model_len
=
1024
,
):
enable_lora
=
True
,
with
monkeypatch
.
context
()
as
m
:
max_loras
=
4
,
m
.
setenv
(
"VLLM_MXFP4_USE_MARLIN"
,
"1"
if
mxfp4_use_marlin
else
"0"
)
max_lora_rank
=
8
,
llm
=
vllm
.
LLM
(
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
MODEL_PATH
,
cudagraph_specialize_lora
=
False
,
max_model_len
=
1024
,
),
enable_lora
=
True
,
)
max_loras
=
4
,
max_lora_rank
=
8
,
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
1
)
max_num_seqs
=
2
,
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
2
)
max_num_batched_tokens
=
2048
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
1
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"fully_sharded_loras"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"fully_sharded_loras"
,
[
False
,
True
])
def
test_gpt_oss_lora_tp2
(
gptoss20b_lora_files
,
fully_sharded_loras
):
@
pytest
.
mark
.
parametrize
(
"mxfp4_use_marlin"
,
[
True
,
False
])
llm
=
vllm
.
LLM
(
def
test_gpt_oss_lora_tp2
(
MODEL_PATH
,
monkeypatch
:
pytest
.
MonkeyPatch
,
max_model_len
=
1024
,
gptoss20b_lora_files
,
enable_lora
=
True
,
fully_sharded_loras
,
max_loras
=
2
,
mxfp4_use_marlin
,
max_lora_rank
=
8
,
):
max_num_seqs
=
16
,
with
monkeypatch
.
context
()
as
m
:
tensor_parallel_size
=
2
,
m
.
setenv
(
"VLLM_MXFP4_USE_MARLIN"
,
"1"
if
mxfp4_use_marlin
else
"0"
)
fully_sharded_loras
=
fully_sharded_loras
,
llm
=
vllm
.
LLM
(
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
MODEL_PATH
,
cudagraph_specialize_lora
=
False
,
max_model_len
=
1024
,
),
enable_lora
=
True
,
)
max_loras
=
2
,
max_num_seqs
=
2
,
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
1
)
max_num_batched_tokens
=
2048
,
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
2
)
tensor_parallel_size
=
2
,
gpu_memory_utilization
=
0.8
,
fully_sharded_loras
=
fully_sharded_loras
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
1
)
generate_and_test
(
llm
,
gptoss20b_lora_files
,
lora_id
=
2
)
tests/lora/test_layers.py
View file @
7e63ef82
...
@@ -43,8 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -43,8 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding
,
VocabParallelEmbedding
,
get_masked_input_and_mask
,
get_masked_input_and_mask
,
)
)
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
from
.utils
import
DummyLoRAManager
from
.utils
import
DummyLoRAManager
...
@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
...
@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_embeddings
(
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
)
->
None
:
def
test_embeddings
(
default_vllm_config
,
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
)
->
None
:
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# device, see: https://github.com/triton-lang/triton/issues/2925
# device, see: https://github.com/triton-lang/triton/issues/2925
# Same below.
# Same below.
...
@@ -261,11 +263,11 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
...
@@ -261,11 +263,11 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
max_loras
=
8
max_loras
=
8
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
)
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
create_random_embedding_layer
():
def
create_random_embedding_layer
():
embedding
=
VocabParallelEmbedding
(
vocab_size
,
256
)
embedding
=
VocabParallelEmbedding
(
vocab_size
,
256
)
...
@@ -353,18 +355,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
...
@@ -353,18 +355,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
256512
])
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
256512
])
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_lm_head_logits_processor
(
def
test_lm_head_logits_processor
(
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
default_vllm_config
,
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
)
->
None
:
)
->
None
:
if
current_platform
.
is_cuda_alike
():
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
max_loras
=
8
max_loras
=
8
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
)
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
_pretest
():
def
_pretest
():
linear
=
ParallelLMHead
(
linear
=
ParallelLMHead
(
...
@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
...
@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_linear_replicated
(
def
test_linear_replicated
(
default_vllm_config
,
dist_init
,
dist_init
,
num_loras
,
num_loras
,
device
,
device
,
...
@@ -480,13 +483,13 @@ def test_linear_replicated(
...
@@ -480,13 +483,13 @@ def test_linear_replicated(
max_loras
=
8
max_loras
=
8
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_loras
=
max_loras
,
max_lora_rank
=
8
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
,
lora_dtype
=
torch
.
float16
,
)
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
create_random_linear_replicated_layer
():
def
create_random_linear_replicated_layer
():
linear
=
ReplicatedLinear
(
4096
,
4096
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
linear
=
ReplicatedLinear
(
4096
,
4096
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
...
@@ -580,21 +583,21 @@ def test_linear_replicated(
...
@@ -580,21 +583,21 @@ def test_linear_replicated(
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_linear_parallel
(
def
test_linear_parallel
(
dist_init
,
num_loras
,
orientation
,
fully_shard
,
device
,
stage
default_vllm_config
,
dist_init
,
num_loras
,
orientation
,
fully_shard
,
device
,
stage
)
->
None
:
)
->
None
:
if
current_platform
.
is_cuda_alike
():
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
max_loras
=
8
max_loras
=
8
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_loras
=
max_loras
,
max_lora_rank
=
8
,
max_lora_rank
=
8
,
fully_sharded_loras
=
fully_shard
,
fully_sharded_loras
=
fully_shard
,
lora_dtype
=
torch
.
float16
,
lora_dtype
=
torch
.
float16
,
)
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
create_random_linear_parallel_layer
():
def
create_random_linear_parallel_layer
():
if
orientation
==
"row"
:
if
orientation
==
"row"
:
...
@@ -705,21 +708,21 @@ def test_linear_parallel(
...
@@ -705,21 +708,21 @@ def test_linear_parallel(
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_column_parallel_packed
(
def
test_column_parallel_packed
(
dist_init
,
num_loras
,
repeats
,
fully_shard
,
device
,
stage
default_vllm_config
,
dist_init
,
num_loras
,
repeats
,
fully_shard
,
device
,
stage
)
->
None
:
)
->
None
:
if
current_platform
.
is_cuda_alike
():
if
current_platform
.
is_cuda_alike
():
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
max_loras
=
8
max_loras
=
8
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_loras
=
max_loras
,
max_lora_rank
=
8
,
max_lora_rank
=
8
,
fully_sharded_loras
=
fully_shard
,
fully_sharded_loras
=
fully_shard
,
lora_dtype
=
torch
.
float16
,
lora_dtype
=
torch
.
float16
,
)
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
lora_config
=
lora_config
)
assert
check_punica_wrapper
(
punica_wrapper
)
def
create_column_parallel_packed_layer
():
def
create_column_parallel_packed_layer
():
if
repeats
==
2
:
if
repeats
==
2
:
...
@@ -851,7 +854,7 @@ def test_column_parallel_packed(
...
@@ -851,7 +854,7 @@ def test_column_parallel_packed(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
))
"seed"
,
list
(
range
(
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
))
)
)
def
test_vocab_parallel_embedding_indices
(
tp_size
,
seed
):
def
test_vocab_parallel_embedding_indices
(
tp_size
,
seed
,
default_vllm_config
):
random
.
seed
(
seed
)
random
.
seed
(
seed
)
vocab_size
=
random
.
randint
(
4000
,
64000
)
vocab_size
=
random
.
randint
(
4000
,
64000
)
added_vocab_size
=
random
.
randint
(
0
,
1024
)
added_vocab_size
=
random
.
randint
(
0
,
1024
)
...
...
tests/lora/test_llama_tp.py
View file @
7e63ef82
...
@@ -77,11 +77,18 @@ def do_sample(
...
@@ -77,11 +77,18 @@ def do_sample(
if
lora_id
if
lora_id
else
None
,
else
None
,
)
)
# Print the outputs.
lora_request
=
LoRARequest
(
str
(
lora_id
),
lora_id
,
lora_path
)
if
lora_id
else
None
generated_texts
:
list
[
str
]
=
[]
generated_texts
:
list
[
str
]
=
[]
for
output
in
outputs
:
for
output
in
outputs
:
prompt
=
output
.
prompt
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
generated_text
=
output
.
outputs
[
0
].
text
# The output should include correct lora_request info
if
lora_request
is
not
None
:
assert
output
.
lora_request
.
lora_name
==
lora_request
.
lora_name
assert
output
.
lora_request
.
lora_int_id
==
lora_request
.
lora_int_id
assert
output
.
lora_request
.
lora_path
==
lora_request
.
lora_path
else
:
assert
output
.
lora_request
is
None
generated_texts
.
append
(
generated_text
)
generated_texts
.
append
(
generated_text
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
return
generated_texts
return
generated_texts
...
...
tests/lora/test_lora_manager.py
View file @
7e63ef82
...
@@ -18,6 +18,7 @@ from vllm.lora.layers import (
...
@@ -18,6 +18,7 @@ from vllm.lora.layers import (
from
vllm.lora.lora_model
import
LoRAModel
from
vllm.lora.lora_model
import
LoRAModel
from
vllm.lora.lora_weights
import
LoRALayerWeights
,
PackedLoRALayerWeights
from
vllm.lora.lora_weights
import
LoRALayerWeights
,
PackedLoRALayerWeights
from
vllm.lora.model_manager
import
(
from
vllm.lora.model_manager
import
(
DEFAULT_LANGUAGE_WRAPPER_KEY
,
LoRAMapping
,
LoRAMapping
,
LoRAModelManager
,
LoRAModelManager
,
LRUCacheLoRAModelManager
,
LRUCacheLoRAModelManager
,
...
@@ -110,7 +111,7 @@ def create_packed_lora(
...
@@ -110,7 +111,7 @@ def create_packed_lora(
return
LoRAModel
(
lora_id
,
8
,
loras
)
return
LoRAModel
(
lora_id
,
8
,
loras
)
def
test_replace_submodules
(
dist_init
,
dummy_model
):
def
test_replace_submodules
(
default_vllm_config
,
dist_init
,
dummy_model
):
model
=
dummy_model
model
=
dummy_model
manager
=
LoRAModelManager
(
manager
=
LoRAModelManager
(
model
,
model
,
...
@@ -132,7 +133,7 @@ def test_replace_submodules(dist_init, dummy_model):
...
@@ -132,7 +133,7 @@ def test_replace_submodules(dist_init, dummy_model):
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lora_model_manager
(
dist_init
,
dummy_model
,
device
):
def
test_lora_model_manager
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
model
=
dummy_model
model
=
dummy_model
model_lora1
=
create_lora
(
model_lora1
=
create_lora
(
1
,
model
,
[
"layer1.dense1"
,
"dense2"
,
"lm_head"
],
device
=
device
1
,
model
,
[
"layer1.dense1"
,
"dense2"
,
"lm_head"
],
device
=
device
...
@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
...
@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
assert
manager
.
activate_adapter
(
2
)
assert
manager
.
activate_adapter
(
2
)
assert
manager
.
lora_index_to_id
[
0
]
==
3
assert
manager
.
lora_index_to_id
[
0
]
==
3
assert
manager
.
lora_index_to_id
[
1
]
==
2
assert
manager
.
lora_index_to_id
[
1
]
==
2
assert
manager
.
device
==
device
assert
manager
.
device
==
device
assert
manager
.
punica_wrapper
.
device
==
device
assert
(
manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
).
device
==
device
)
assert
hasattr
(
manager
,
"supported_lora_modules"
)
assert
hasattr
(
manager
,
"supported_lora_modules"
)
assert
sorted
(
manager
.
supported_lora_modules
)
==
[
assert
sorted
(
manager
.
supported_lora_modules
)
==
[
"dense1"
,
"dense1"
,
...
@@ -196,7 +199,9 @@ def test_lora_model_manager(dist_init, dummy_model, device):
...
@@ -196,7 +199,9 @@ def test_lora_model_manager(dist_init, dummy_model, device):
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lora_lru_cache_model_manager
(
dist_init
,
dummy_model
,
device
):
def
test_lora_lru_cache_model_manager
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
model
=
dummy_model
model
=
dummy_model
model_lora1
=
create_lora
(
model_lora1
=
create_lora
(
1
,
model
,
[
"layer1.dense1"
,
"dense2"
,
"lm_head"
],
device
=
device
1
,
model
,
[
"layer1.dense1"
,
"dense2"
,
"lm_head"
],
device
=
device
...
@@ -278,13 +283,15 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
...
@@ -278,13 +283,15 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
assert
manager
.
remove_adapter
(
3
)
assert
manager
.
remove_adapter
(
3
)
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
assert
manager
.
pin_adapter
(
3
)
assert
manager
.
pin_adapter
(
3
)
assert
(
assert
manager
.
punica_wrapper
.
device
==
device
manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
).
device
==
device
)
assert
manager
.
device
==
device
assert
manager
.
device
==
device
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lru_lora_model_manager
(
dist_init
,
dummy_model
,
device
):
def
test_lru_lora_model_manager
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
# This tests just the LRU cache functionality, everything else is
# This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager
# tested in test_lora_model_manager
model
=
dummy_model
model
=
dummy_model
...
@@ -402,12 +409,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
...
@@ -402,12 +409,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert
manager
.
remove_oldest_adapter
()
assert
manager
.
remove_oldest_adapter
()
assert
set
(
manager
.
list_adapters
())
==
{
1
}
assert
set
(
manager
.
list_adapters
())
==
{
1
}
assert
manager
.
punica_wrapper
.
device
==
device
assert
(
manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
).
device
==
device
)
assert
manager
.
device
==
device
assert
manager
.
device
==
device
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lru_cache_worker_adapter_manager
(
dist_init
,
dummy_model
,
device
,
tmp_path
):
def
test_lru_cache_worker_adapter_manager
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
,
tmp_path
):
lora_config
=
LoRAConfig
(
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
)
)
...
@@ -514,11 +526,16 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
...
@@ -514,11 +526,16 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
)
)
assert
worker_adapter_manager
.
device
==
device
assert
worker_adapter_manager
.
device
==
device
assert
worker_adapter_manager
.
_adapter_manager
.
punica_wrapper
.
device
==
device
punica_wrapper
=
worker_adapter_manager
.
_adapter_manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert
punica_wrapper
.
device
==
device
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_worker_adapter_manager
(
dist_init
,
dummy_model_gate_up
,
device
,
tmp_path
):
def
test_worker_adapter_manager
(
default_vllm_config
,
dist_init
,
dummy_model_gate_up
,
device
,
tmp_path
):
# Should remove every LoRA not specified in the request.
# Should remove every LoRA not specified in the request.
lora_config
=
LoRAConfig
(
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
...
@@ -618,11 +635,14 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
...
@@ -618,11 +635,14 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
)
)
assert
worker_adapter_manager
.
device
==
device
assert
worker_adapter_manager
.
device
==
device
assert
worker_adapter_manager
.
_adapter_manager
.
punica_wrapper
.
device
==
device
punica_wrapper
=
worker_adapter_manager
.
_adapter_manager
.
punica_wrapper_mapping
.
get
(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert
punica_wrapper
.
device
==
device
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_packed_loras
(
dist_init
,
dummy_model_gate_up
,
device
):
def
test_packed_loras
(
default_vllm_config
,
dist_init
,
dummy_model_gate_up
,
device
):
model
=
dummy_model_gate_up
model
=
dummy_model_gate_up
model_lora
=
create_packed_lora
(
model_lora
=
create_packed_lora
(
1
,
1
,
...
...
Prev
1
…
21
22
23
24
25
26
27
28
29
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment