Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
675ba75f
Commit
675ba75f
authored
Apr 07, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.3' into v0.8.3-ori
parents
5cc98918
296c6572
Changes
501
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
908 additions
and
731 deletions
+908
-731
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+134
-0
tests/kernels/test_cutlass_moe.py
tests/kernels/test_cutlass_moe.py
+244
-0
tests/kernels/test_ggml.py
tests/kernels/test_ggml.py
+2
-1
tests/kernels/test_gguf.py
tests/kernels/test_gguf.py
+2
-2
tests/kernels/test_lightning_attn.py
tests/kernels/test_lightning_attn.py
+286
-0
tests/kernels/test_mla_decode_cpu.py
tests/kernels/test_mla_decode_cpu.py
+94
-0
tests/kernels/test_moe.py
tests/kernels/test_moe.py
+19
-6
tests/kernels/test_prefix_prefill.py
tests/kernels/test_prefix_prefill.py
+4
-0
tests/kernels/test_uva.py
tests/kernels/test_uva.py
+61
-0
tests/lora/conftest.py
tests/lora/conftest.py
+0
-56
tests/lora/data/long_context_test_data.py
tests/lora/data/long_context_test_data.py
+0
-121
tests/lora/test_baichuan.py
tests/lora/test_baichuan.py
+0
-8
tests/lora/test_chatglm3_tp.py
tests/lora/test_chatglm3_tp.py
+8
-8
tests/lora/test_gemma.py
tests/lora/test_gemma.py
+0
-65
tests/lora/test_layers.py
tests/lora/test_layers.py
+23
-122
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+8
-29
tests/lora/test_long_context.py
tests/lora/test_long_context.py
+0
-301
tests/lora/test_lora_manager.py
tests/lora/test_lora_manager.py
+11
-3
tests/lora/test_minicpmv_tp.py
tests/lora/test_minicpmv_tp.py
+4
-1
tests/lora/test_phi.py
tests/lora/test_phi.py
+8
-8
No files found.
tests/kernels/test_cutlass.py
View file @
675ba75f
...
...
@@ -3,6 +3,7 @@
Run `pytest tests/kernels/test_cutlass.py`.
"""
import
random
import
pytest
import
torch
...
...
@@ -507,3 +508,136 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
def test_cutlass_support_opcheck():
    """Run ``opcheck`` on the ``cutlass_scaled_mm_supports_fp8`` custom op.

    ``capability`` is not defined in this function; it is read from the
    enclosing module.
    """
    target_op = torch.ops._C.cutlass_scaled_mm_supports_fp8
    op_args = (capability, )
    opcheck(target_op, op_args)
@pytest.mark.parametrize("num_experts", [8, 64])
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
                                per_out_ch: bool, use_bias: bool):
    """Check ``ops.cutlass_moe_mm`` against a per-expert baseline.

    One random problem is built per expert. All experts share the same N
    and K while M is drawn per expert (every dimension is a multiple of
    16). For each expert a baseline is computed with
    ``baseline_scaled_mm``; the inputs are then stacked, passed to
    ``ops.cutlass_moe_mm`` in a single call, and each expert's slice of
    the output is compared with its baseline.

    Args:
        num_experts: Number of groups (experts) to generate.
        per_act_token: Use one A scale per row instead of a single
            shared (1, 1) scale.
        per_out_ch: Use one B scale per output column instead of a
            single scale per expert.
        use_bias: Parametrized but not read by this body; the baseline
            is called with ``None`` as its last argument.
    """
    # Device and dtype setup
    device = "cuda"
    out_dtype = torch.half

    # Per-expert inputs, scales and baseline outputs
    a_tensors = []
    b_tensors = []
    a_scales_tensors = []
    b_scales_tensors = []
    baseline_tensors = []

    # expert_offsets[g] is the first row of expert g in the stacked A/out
    # tensors; the extra trailing entry holds the total number of rows.
    expert_offsets = torch.zeros((num_experts + 1),
                                 device=device,
                                 dtype=torch.int32)

    # One (M, N, K) triple per expert.
    problem_sizes = torch.zeros((num_experts, 3),
                                device=device,
                                dtype=torch.int32)

    if not per_act_token:
        # Single activation scale shared by every expert.
        one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32)

    alignment = 16  # 128 // 8
    # For variation, each group has its own M; N and K are shared.
    n_g = alignment * random.randint(1, 64)
    k_g = alignment * random.randint(1, 64)
    # Loop-invariant, and also needed after the loop for the stacked scales.
    n_b_scales = n_g if per_out_ch else 1

    for g in range(num_experts):
        m_g = alignment * random.randint(1, 64)

        expert_offsets[g + 1] = expert_offsets[g] + m_g
        problem_sizes[g][0] = m_g
        problem_sizes[g][1] = n_g
        problem_sizes[g][2] = k_g

        m_a_scales = m_g if per_act_token else 1

        # Create group-specific A and B (FP8)
        a_g = to_fp8(torch.randn((m_g, k_g), device=device))
        b_g = to_fp8(torch.randn((n_g, k_g), device=device).t())
        a_tensors.append(a_g)
        b_tensors.append(b_g)

        # Set up A/B scales
        scale_b = torch.randn((1, n_b_scales),
                              device=device,
                              dtype=torch.float32)
        b_scales_tensors.append(scale_b)

        if per_act_token:
            scale_a = torch.randn((m_a_scales, 1),
                                  device=device,
                                  dtype=torch.float32)
            a_scales_tensors.append(scale_a)
        else:
            scale_a = one_scale_a

        # Compute baseline result for this group
        baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype,
                                        None)
        baseline_tensors.append(baseline_g)

    a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g),
                                    device=device,
                                    dtype=torch.float8_e4m3fn)
    b_tensors_stacked = torch.empty((num_experts, n_g, k_g),
                                    device=device,
                                    dtype=torch.float8_e4m3fn)

    for g in range(num_experts):
        a_tensors_stacked[expert_offsets[g]:expert_offsets[g +
                                                           1]] = a_tensors[g]
        # b_tensors[g] is (K, N); store it as (N, K) and expose the whole
        # stack as (E, K, N) through the transpose below.
        b_tensors_stacked[g] = b_tensors[g].t()
    b_tensors_stacked = b_tensors_stacked.transpose(1, 2)

    if per_act_token:
        a_scales_tensors_stacked = torch.empty(
            (expert_offsets[num_experts], 1),
            device=device,
            dtype=torch.float32)
        for g in range(num_experts):
            a_scales_tensors_stacked[
                expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g]
    else:
        a_scales_tensors_stacked = one_scale_a

    b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales),
                                           device=device,
                                           dtype=torch.float32)
    for g in range(num_experts):
        b_scales_tensors_stacked[g] = b_scales_tensors[g]

    out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g),
                                      device=device,
                                      dtype=out_dtype)

    ab_strides = torch.full((num_experts, ),
                            a_tensors_stacked.stride(0),
                            device="cuda",
                            dtype=torch.int64)
    c_strides = torch.full((num_experts, ),
                           out_tensors_stacked.stride(0),
                           device="cuda",
                           dtype=torch.int64)

    ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked,
                       b_tensors_stacked, a_scales_tensors_stacked,
                       b_scales_tensors_stacked, expert_offsets[:-1],
                       problem_sizes, ab_strides, ab_strides, c_strides)

    # Validate each group's result against the baseline
    for g in range(num_experts):
        baseline = baseline_tensors[g]
        c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
        torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)
tests/kernels/test_cutlass_moe.py
0 → 100644
View file @
675ba75f
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.fused_moe.cutlass_moe
import
cutlass_moe_fp8
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
fused_experts
,
fused_topk
)
from
vllm.platforms
import
current_platform
# Expert counts used for the "e" parameter of the tests below.
NUM_EXPERTS = [40, 64]
# Values used for the "topk" parameter of the tests below.
TOP_KS = [6, 8]
def run(a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
        w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
        ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
        ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
    """Call ``cutlass_moe_fp8`` under a freshly built vLLM config.

    The config is a ``VllmConfig`` whose ``ParallelConfig`` has
    ``pipeline_parallel_size=1``. ``a_scale`` is forwarded as the
    ``a1_scale`` keyword; every other argument is forwarded positionally.

    Returns:
        Whatever ``cutlass_moe_fp8`` returns.
    """
    parallel_config = ParallelConfig(pipeline_parallel_size=1)
    vllm_config = VllmConfig(parallel_config=parallel_config)
    with set_current_vllm_config(vllm_config):
        result = cutlass_moe_fp8(a,
                                 w1_q,
                                 w2_q,
                                 w1_scale,
                                 w2_scale,
                                 topk_weights,
                                 topk_ids,
                                 ab_strides1,
                                 c_strides1,
                                 ab_strides2,
                                 c_strides2,
                                 a1_scale=a_scale)
    return result
@pytest.mark.parametrize("m", [2, 64, 224])
@pytest.mark.parametrize("n", [1024, 3072])
@pytest.mark.parametrize("k", [1024, 1536])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_moe_no_graph(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
):
    """Compare ``cutlass_moe_fp8`` with ``fused_experts`` (eager call).

    Activations and per-expert weights are quantized with
    ``ops.scaled_fp8_quant``. The reference path runs ``fused_experts``
    on the dequantized (quantized value times scale) tensors; the path
    under test runs ``cutlass_moe_fp8`` on the quantized weights.

    Args:
        m: Number of activation rows.
        n: Weight dimension; ``w1`` is ``(e, 2 * n, k)`` and ``w2`` is
            ``(e, k, n)``.
        k: Activation width.
        e: Number of experts.
        topk: Passed to ``fused_topk``.
        per_act_token: Forwarded as ``use_per_token_if_dynamic`` when
            quantizing the activations.
        per_out_ch: Forwarded as ``use_per_token_if_dynamic`` when
            quantizing the weights.
    """
    current_platform.seed_everything(7)
    with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(
                pipeline_parallel_size=1))):

        dtype = torch.half

        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

        # Get the right scale for tests: the first call yields the scale,
        # the second quantizes with that scale.
        _, a_scale1 = ops.scaled_fp8_quant(
            a, use_per_token_if_dynamic=per_act_token)
        a_q, _ = ops.scaled_fp8_quant(a,
                                      a_scale1,
                                      use_per_token_if_dynamic=per_act_token)

        # Dequantized activations for the reference path.
        a_d = a_q.float().mul(a_scale1).to(dtype)

        n_b_scales = 2 * n if per_out_ch else 1
        k_b_scales = k if per_out_ch else 1

        w1_q = torch.empty((e, 2 * n, k),
                           device="cuda",
                           dtype=torch.float8_e4m3fn)
        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
        w1_scale = torch.empty((e, n_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)
        w2_scale = torch.empty((e, k_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)

        # Stride tensors handed to cutlass_moe_fp8 (one entry per expert).
        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
        c_strides1 = torch.full((e, ),
                                2 * n,
                                device="cuda",
                                dtype=torch.int64)
        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)

        for expert in range(e):
            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
                w1[expert], use_per_token_if_dynamic=per_out_ch)
            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
                w2[expert], use_per_token_if_dynamic=per_out_ch)
        w1_q = w1_q.transpose(1, 2)
        w2_q = w2_q.transpose(1, 2)

        # Dequantized weights for the reference path; .t() undoes the
        # transpose applied just above.
        w1_d = torch.empty_like(w1)
        w2_d = torch.empty_like(w2)
        for expert in range(e):
            w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half()
            w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half()

        score = torch.randn((m, e), device="cuda", dtype=dtype)
        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)

        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)

        cutlass_output = cutlass_moe_fp8(a,
                                         w1_q,
                                         w2_q,
                                         w1_scale,
                                         w2_scale,
                                         topk_weights,
                                         topk_ids,
                                         ab_strides1,
                                         c_strides1,
                                         ab_strides2,
                                         c_strides2,
                                         a1_scale=a_scale1)

        torch.testing.assert_close(triton_output,
                                   cutlass_output,
                                   atol=5e-2,
                                   rtol=1e-2)
@pytest.mark.parametrize("m", [2, 64, 224])
@pytest.mark.parametrize("n", [1024, 3072])
@pytest.mark.parametrize("k", [1024, 1536])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_moe_cuda_graph(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
):
    """Compare ``cutlass_moe_fp8`` with ``fused_experts`` via a CUDA graph.

    The setup matches the eager variant of this test: activations and
    weights are quantized with ``ops.scaled_fp8_quant`` and the reference
    is ``fused_experts`` on the dequantized tensors. Here the call under
    test (through ``run``) is captured into a ``torch.cuda.CUDAGraph``
    and replayed once before the outputs are compared.

    Args:
        m: Number of activation rows.
        n: Weight dimension; ``w1`` is ``(e, 2 * n, k)`` and ``w2`` is
            ``(e, k, n)``.
        k: Activation width.
        e: Number of experts.
        topk: Passed to ``fused_topk``.
        per_act_token: Forwarded as ``use_per_token_if_dynamic`` when
            quantizing the activations.
        per_out_ch: Forwarded as ``use_per_token_if_dynamic`` when
            quantizing the weights.
    """
    current_platform.seed_everything(7)
    with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(
                pipeline_parallel_size=1))):

        dtype = torch.half

        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

        # Get the right scale for tests: the first call yields the scale,
        # the second quantizes with that scale.
        _, a_scale1 = ops.scaled_fp8_quant(
            a, use_per_token_if_dynamic=per_act_token)
        a_q, _ = ops.scaled_fp8_quant(a,
                                      a_scale1,
                                      use_per_token_if_dynamic=per_act_token)

        # Dequantized activations for the reference path.
        a_d = a_q.float().mul(a_scale1).to(dtype)

        n_b_scales = 2 * n if per_out_ch else 1
        k_b_scales = k if per_out_ch else 1

        w1_q = torch.empty((e, 2 * n, k),
                           device="cuda",
                           dtype=torch.float8_e4m3fn)
        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
        w1_scale = torch.empty((e, n_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)
        w2_scale = torch.empty((e, k_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)

        # Stride tensors handed to cutlass_moe_fp8 (one entry per expert).
        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
        c_strides1 = torch.full((e, ),
                                2 * n,
                                device="cuda",
                                dtype=torch.int64)
        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)

        for expert in range(e):
            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
                w1[expert], use_per_token_if_dynamic=per_out_ch)
            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
                w2[expert], use_per_token_if_dynamic=per_out_ch)
        w1_q = w1_q.transpose(1, 2)
        w2_q = w2_q.transpose(1, 2)

        # Dequantized weights for the reference path; .t() undoes the
        # transpose applied just above.
        w1_d = torch.empty_like(w1)
        w2_d = torch.empty_like(w2)
        for expert in range(e):
            w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half()
            w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half()

        score = torch.randn((m, e), device="cuda", dtype=dtype)
        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)

        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)

        # Capture the call under test on a side stream, then replay it.
        stream = torch.cuda.Stream()
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph, stream=stream):
            cutlass_output = run(a, a_scale1, w1_q, w2_q, w1_scale, w2_scale,
                                 topk_weights, topk_ids, ab_strides1,
                                 c_strides1, ab_strides2, c_strides2)
        torch.cuda.synchronize()
        graph.replay()
        torch.cuda.synchronize()

        torch.testing.assert_close(triton_output,
                                   cutlass_output,
                                   atol=9e-2,
                                   rtol=1e-2)
tests/kernels/test_ggml.py
View file @
675ba75f
...
...
@@ -15,7 +15,8 @@ def test_ggml_opcheck(quant_type):
qweight
=
torch
.
randint
(
0
,
100
,
shape
,
device
=
'cuda'
,
dtype
=
torch
.
uint8
)
m
=
qweight
.
shape
[
0
]
n
=
qweight
.
shape
[
1
]
//
type_size
*
block_size
opcheck
(
torch
.
ops
.
_C
.
ggml_dequantize
,
(
qweight
,
quant_type
,
m
,
n
))
opcheck
(
torch
.
ops
.
_C
.
ggml_dequantize
,
(
qweight
,
quant_type
,
m
,
n
,
torch
.
float16
))
x
=
torch
.
rand
((
m
,
512
),
device
=
'cuda'
,
dtype
=
torch
.
float16
)
opcheck
(
torch
.
ops
.
_C
.
ggml_mul_mat_a8
,
...
...
tests/kernels/test_gguf.py
View file @
675ba75f
...
...
@@ -65,7 +65,7 @@ QUANT_TYPES = [
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
half
]
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
QUANT_TYPES
)
@
torch
.
inference_mode
()
def
test_dequantize
(
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
...
...
@@ -78,7 +78,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
ref_output
=
torch
.
tensor
(
dequantize
(
tensor
.
data
,
quant_type
),
device
=
"cuda"
).
to
(
dtype
)
output
=
ops
.
ggml_dequantize
(
torch
.
tensor
(
tensor
.
data
,
device
=
"cuda"
),
quant_type
,
*
list
(
shape
)
).
to
(
dtype
)
quant_type
,
*
list
(
shape
)
,
dtype
)
torch
.
testing
.
assert_close
(
output
,
ref_output
,
atol
=
1e-2
,
rtol
=
4e-2
)
...
...
tests/kernels/test_lightning_attn.py
0 → 100644
View file @
675ba75f
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm.model_executor.layers.lightning_attn
import
(
linear_decode_forward_triton
)
from
vllm.platforms
import
current_platform
# Parameter grids consumed by the pytest.mark.parametrize decorators below.
NUM_HEADS = [4, 8]
HEAD_SIZES = [64]
BATCH_SIZES = [1, 2]
SEQ_LENGTHS = [16]
DTYPES = [torch.float32]
def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
    """Step-by-step reference for the lightning attention recurrence.

    For every batch item, position and head, the running KV state is
    updated as ``decay * state + outer(k, v)`` and the output row is
    ``q @ state``. Everything is computed sequentially in plain PyTorch.

    Args:
        q: Tensor of shape ``[B, H, S, D]``.
        k: Indexed like ``q``; its ``[b, h, t]`` rows feed the outer
            product.
        v: Indexed like ``q``; its last dimension is the output width E.
        ed: Decay exponents. A 1-D tensor is reshaped to
            ``(1, -1, 1, 1)``; otherwise it is indexed as
            ``[0, h, 0, 0]`` after ``exp(-ed)``.
        block_size: Accepted for signature parity; not used here.
        kv_history: Initial KV state of shape ``[B, H, D, E]`` or
            ``None`` for a zero state. It is cloned, never modified.

    Returns:
        ``(output, kv)`` where ``output`` is ``[B, H, S, E]`` and ``kv``
        is ``[B, H, 2, D, E]`` holding the final state twice along
        dimension 2.
    """
    batch, heads, seq_len, qk_dim = q.shape
    v_dim = v.shape[-1]

    out = torch.zeros((batch, heads, seq_len, v_dim),
                      dtype=q.dtype,
                      device=q.device)

    # Work on a private copy so the caller's history stays untouched.
    if kv_history is not None:
        state = kv_history.clone()
    else:
        state = torch.zeros((batch, heads, qk_dim, v_dim),
                            dtype=q.dtype,
                            device=q.device)

    decay = torch.exp(-ed)
    if ed.dim() == 1:
        decay = decay.view(1, -1, 1, 1)

    for b in range(batch):
        for t in range(seq_len):
            for h in range(heads):
                kv_update = torch.outer(k[b, h, t], v[b, h, t])
                # Decay the old state first, then add the new outer product.
                state[b, h] = decay[0, h, 0, 0] * state[b, h] + kv_update
                out[b, h, t] = torch.matmul(q[b, h, t], state[b, h])

    # Duplicate the final state along a new axis: [B, H, 2, D, E].
    expanded = state.unsqueeze(2)
    return out, torch.cat([expanded, expanded], dim=2)
def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx):
    """Sequential reference for a single linear-attention decode step.

    For each batch row whose ``slot_idx`` entry is not ``-1`` and for
    each head, the cache entry ``kv_caches[b, h]`` is replaced by
    ``outer(k, v) + exp(-slope_rate[h]) * kv_caches[b, h]`` and the
    output slice for that head is ``q @ new_cache``.

    Args:
        q: Tensor of shape ``[B, H, _, D]``; only index 0 of the third
            dimension is read.
        k: Indexed like ``q``.
        v: Indexed like ``q``.
        kv_caches: Cache tensor indexed as ``[b, h]``; updated in place.
        slope_rate: Per-head decay exponents.
        slot_idx: Per-batch slot ids; ``-1`` marks rows to skip.

    Returns:
        Tensor of shape ``[B, H * D]``; skipped rows stay zero.
    """
    batch, heads, _, dim = q.shape
    result = torch.zeros(batch, heads * dim, dtype=q.dtype, device=q.device)

    # Per-head decay factors, shaped [H, 1, 1].
    decay = torch.exp(-slope_rate).view(-1, 1, 1)

    for b in range(batch):
        # Rows marked -1 are padding: leave output and cache untouched.
        if slot_idx[b].item() == -1:
            continue

        for h in range(heads):
            updated = (torch.outer(k[b, h, 0], v[b, h, 0]) +
                       decay[h, 0, 0] * kv_caches[b, h])
            start = h * dim
            result[b, start:start + dim] = torch.matmul(q[b, h, 0], updated)
            kv_caches[b, h] = updated

    return result
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_linear_decode_forward_triton(
    batch_size: int,
    num_heads: int,
    head_size: int,
    dtype: torch.dtype,
):
    """Compare ``linear_decode_forward_triton`` with the Python reference.

    Both implementations receive the same q/k/v, slope rates and slot
    indices, each with its own copy of the KV cache. The outputs and the
    updated caches are then compared.
    """
    # Scope the default device to this test instead of changing it
    # globally, so later tests are not affected.
    with torch.device("cuda"):
        torch.manual_seed(42)
        torch.cuda.manual_seed_all(42)
        current_platform.seed_everything(42)
        base = 0.01
        q = base * torch.randn(batch_size, num_heads, 1, head_size,
                               dtype=dtype)
        k = base * torch.randn(batch_size, num_heads, 1, head_size,
                               dtype=dtype)
        v = base * torch.randn(batch_size, num_heads, 1, head_size,
                               dtype=dtype)

        kv_caches = base * torch.randn(batch_size,
                                       num_heads,
                                       head_size,
                                       head_size,
                                       dtype=dtype,
                                       device="cuda")

        # Separate copy for the reference, which updates its cache in place.
        kv_caches_copy = kv_caches.clone()

        slope_rate = torch.zeros(num_heads, device="cuda")
        for h in range(num_heads):
            slope_rate[h] = 0.1 * (h + 1)

        slot_idx = torch.arange(batch_size, device="cuda")

        triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
                                                     slope_rate, slot_idx)

        reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
                                                   slope_rate, slot_idx)
        # NOTE(review): inputs are scaled by base=0.01, so atol=1e-1 is
        # large relative to the values compared - consider tightening.
        torch.testing.assert_close(triton_output,
                                   reference_output,
                                   rtol=1e-1,
                                   atol=1e-1)
        torch.testing.assert_close(kv_caches,
                                   kv_caches_copy,
                                   rtol=1e-1,
                                   atol=1e-1)

        assert triton_output.shape == (batch_size, num_heads * head_size)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_linear_decode_forward_triton_with_padding(
    num_heads: int,
    head_size: int,
    dtype: torch.dtype,
):
    """Compare the Triton decode with the reference when one row is padding.

    ``slot_idx`` is ``[0, 1, -1, 2]``; the ``-1`` row is excluded from
    both the output comparison and the cache comparison.
    """
    # Scope the default device to this test instead of changing it
    # globally, so later tests are not affected.
    with torch.device("cuda"):
        torch.manual_seed(42)
        torch.cuda.manual_seed_all(42)
        current_platform.seed_everything(42)

        batch_size = 4
        base = 0.01
        q = base * torch.randn(batch_size, num_heads, 1, head_size,
                               dtype=dtype)
        k = base * torch.randn(batch_size, num_heads, 1, head_size,
                               dtype=dtype)
        v = base * torch.randn(batch_size, num_heads, 1, head_size,
                               dtype=dtype)

        kv_caches = base * torch.randn(batch_size,
                                       num_heads,
                                       head_size,
                                       head_size,
                                       dtype=dtype,
                                       device="cuda")

        # Separate copy for the reference, which updates its cache in place.
        kv_caches_copy = kv_caches.clone()

        slope_rate = torch.zeros(num_heads, device="cuda")
        for h in range(num_heads):
            slope_rate[h] = 0.1 * (h + 1)

        slot_idx = torch.tensor([0, 1, -1, 2], device="cuda")

        triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
                                                     slope_rate, slot_idx)

        reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
                                                   slope_rate, slot_idx)

        # True for rows that carry real data (slot id other than -1).
        valid_indices = slot_idx != -1
        padding_mask = valid_indices.unsqueeze(1).expand(
            -1, num_heads * head_size)

        triton_masked = triton_output[padding_mask]
        reference_masked = reference_output[padding_mask]

        # NOTE(review): inputs are scaled by base=0.01, so these
        # tolerances are large relative to the values compared.
        atol, rtol = 1.5e-1, 1.5e-1

        for i in range(batch_size):
            if valid_indices[i]:
                torch.testing.assert_close(kv_caches[i],
                                           kv_caches_copy[i],
                                           rtol=rtol,
                                           atol=atol)

        torch.testing.assert_close(triton_masked,
                                   reference_masked,
                                   rtol=rtol,
                                   atol=atol)

        assert triton_output.shape == (batch_size, num_heads * head_size)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENGTHS)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_lightning_attention_reference(
    batch_size: int,
    num_heads: int,
    head_size: int,
    seq_len: int,
    dtype: torch.dtype,
):
    """Compare ``lightning_attention`` with the sequential reference.

    Both are called with the same q/k/v, decay exponents, block size 256
    and equal copies of the KV history. The outputs and the returned KV
    caches are then compared.
    """
    # Scope the default device to this test instead of changing it
    # globally, so later tests are not affected.
    with torch.device("cuda"):
        torch.manual_seed(42)
        torch.cuda.manual_seed_all(42)
        current_platform.seed_everything(42)

        base = 0.01
        q = base * torch.randn(
            batch_size, num_heads, seq_len, head_size, dtype=dtype)
        k = base * torch.randn(
            batch_size, num_heads, seq_len, head_size, dtype=dtype)
        v = base * torch.randn(
            batch_size, num_heads, seq_len, head_size, dtype=dtype)

        ed = torch.zeros(num_heads, device="cuda")
        for h in range(num_heads):
            ed[h] = 0.1 * (h + 1)

        kv_history = base * torch.randn(batch_size,
                                        num_heads,
                                        head_size,
                                        head_size,
                                        dtype=dtype,
                                        device="cuda")

        # Independent copy handed to the implementation under test.
        kv_history_clone = kv_history.clone()

        ref_output, ref_kv_cache = reference_lightning_attention(
            q, k, v, ed, 256, kv_history)

        from vllm.model_executor.layers.lightning_attn import (
            lightning_attention)
        actual_output, actual_kv_cache = lightning_attention(
            q, k, v, ed, 256, kv_history_clone)

        # NOTE(review): inputs are scaled by base=0.01, so these
        # tolerances are large relative to the values compared.
        atol, rtol = 1.5e-1, 1.5e-1
        torch.testing.assert_close(ref_output,
                                   actual_output,
                                   rtol=rtol,
                                   atol=atol)
        torch.testing.assert_close(ref_kv_cache,
                                   actual_kv_cache,
                                   rtol=rtol,
                                   atol=atol)

        assert ref_output.shape == (batch_size, num_heads, seq_len,
                                    head_size)
        assert ref_kv_cache.shape == actual_kv_cache.shape
tests/kernels/test_mla_decode_cpu.py
0 → 100644
View file @
675ba75f
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
import
torch.nn.functional
as
F
from
torch
import
Tensor
import
vllm._custom_ops
as
ops
from
vllm.platforms
import
current_platform
def cdiv(a, b):
    """Return ``(a + b - 1) // b`` (ceiling division for positive ints)."""
    padded = a + b - 1
    return padded // b
def
ref_mla
(
out
:
Tensor
,
# (bs, num_heads, v_head_dim)
query
:
Tensor
,
# (bs, num_heads, head_dim)
kv_cache
:
Tensor
,
# (num_blocks, block_size, head_dim)
scale
:
float
,
block_tables
:
Tensor
,
# (bs, max_num_blocks)
seq_lens
:
Tensor
,
# (bs,)
):
bs
,
num_heads
,
v_head_dim
=
out
.
shape
head_dim
=
query
.
shape
[
2
]
for
i
in
range
(
bs
):
# gather and flatten KV-cache
kv
=
kv_cache
[
block_tables
[
i
]]
# (max_num_blocks, block_size, head_dim)
kv
=
kv
.
view
(
1
,
-
1
,
head_dim
)[:,
:
seq_lens
[
i
]]
# (1, seq_len, head_dim)
v
=
kv
[:,
:,
:
v_head_dim
]
q
=
query
[
i
].
view
(
num_heads
,
1
,
head_dim
)
o
=
F
.
scaled_dot_product_attention
(
q
,
kv
,
v
,
scale
=
scale
,
enable_gqa
=
True
)
out
[
i
]
=
o
.
view
(
num_heads
,
v_head_dim
)
return
out
@pytest.mark.parametrize("bs", [4])
@pytest.mark.parametrize("mean_seq_len", [256])
@pytest.mark.parametrize("h_q", [16])
@pytest.mark.parametrize("d", [576])
@pytest.mark.parametrize("dv", [512])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("dtype", [torch.float, torch.half, torch.bfloat16])
@pytest.mark.parametrize("varlen", [False, True])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_mla_decode_cpu(
    bs: int,
    mean_seq_len: int,
    h_q: int,
    d: int,
    dv: int,
    block_size: int,
    dtype: torch.dtype,
    varlen: bool,
):
    """Compare ``ops.mla_decode_kvcache_cpu`` with ``ref_mla``.

    A random query and KV cache are built. Cache positions past each
    sequence's length are filled with NaN, so a read beyond the valid
    range shows up as NaN in the output.

    Args:
        bs: Batch size.
        mean_seq_len: Sequence length, or the mean of the sampled
            lengths when ``varlen`` is true.
        h_q: Number of query heads.
        d: Query / KV head dimension.
        dv: Output (value) head dimension.
        block_size: Number of positions per KV-cache block.
        dtype: Default dtype used for every tensor created in the test.
        varlen: Sample per-sequence lengths instead of using a constant.
    """
    # The default dtype is process-global; restore it afterwards so the
    # half/bfloat16 runs do not leak into later tests.
    previous_default_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        torch.manual_seed(0)

        scale = d**(-0.5)
        if varlen:
            seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
            seq_lens = seq_lens.clip(2).to(torch.int32)
        else:
            seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
        max_seq_len = seq_lens.max().item()
        seqlen_pad = cdiv(max_seq_len, 256) * 256  # is this necessary?

        q = torch.randn(bs, h_q, d)
        block_table = torch.arange(bs * seqlen_pad // block_size,
                                   dtype=torch.int32)
        block_table = block_table.view(bs, seqlen_pad // block_size)

        kv_cache = torch.randn(block_table.numel(), block_size, d)
        # Poison everything past each sequence's end with NaN.
        for i, seq_len in enumerate(seq_lens.tolist()):
            kv_cache.view(bs, seqlen_pad, d)[i, seq_len:] = float("nan")

        out_mla = q.new_zeros(bs, h_q, dv)
        ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table,
                                   seq_lens)

        out_ref = q.new_zeros(bs, h_q, dv)
        ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)

        assert not out_mla.isnan().any(), "Likely read out of bounds"
        torch.testing.assert_close(out_mla, out_ref)
    finally:
        torch.set_default_dtype(previous_default_dtype)
tests/kernels/test_moe.py
View file @
675ba75f
...
...
@@ -3,7 +3,6 @@
Run `pytest tests/kernels/test_moe.py`.
"""
import
pytest
import
torch
from
torch.nn
import
Parameter
...
...
@@ -216,11 +215,17 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"padding"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
@
torch
.
inference_mode
()
def
test_mixtral_moe
(
dtype
:
torch
.
dtype
,
padding
:
bool
):
def
test_mixtral_moe
(
dtype
:
torch
.
dtype
,
padding
:
bool
,
use_rocm_aiter
:
bool
,
monkeypatch
):
"""Make sure our Mixtral MoE implementation agrees with the one from
huggingface."""
if
use_rocm_aiter
:
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
# Instantiate our and huggingface's MoE blocks
config
=
MixtralConfig
()
hf_moe
=
MixtralSparseMoeBlock
(
config
).
to
(
dtype
).
to
(
"cuda"
)
...
...
@@ -268,10 +273,18 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool):
torch
.
bfloat16
:
1e-2
,
}
torch
.
testing
.
assert_close
(
hf_states
.
flatten
(
0
,
1
),
vllm_states
,
rtol
=
mixtral_moe_tol
[
dtype
],
atol
=
mixtral_moe_tol
[
dtype
])
if
use_rocm_aiter
:
# The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501
# https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501
torch
.
testing
.
assert_close
(
hf_states
.
flatten
(
0
,
1
),
vllm_states
,
rtol
=
0.01
,
atol
=
100
)
else
:
torch
.
testing
.
assert_close
(
hf_states
.
flatten
(
0
,
1
),
vllm_states
,
rtol
=
mixtral_moe_tol
[
dtype
],
atol
=
mixtral_moe_tol
[
dtype
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
33
,
64
,
222
])
...
...
tests/kernels/test_prefix_prefill.py
View file @
675ba75f
...
...
@@ -164,6 +164,7 @@ def test_contexted_kv_attention(
block_table
,
b_start_loc
,
b_seq_len
,
MAX_CTX_LEN
,
max_input_len
,
k_scale
,
v_scale
,
...
...
@@ -180,6 +181,7 @@ def test_contexted_kv_attention(
block_table
,
b_start_loc
,
b_seq_len
,
MAX_CTX_LEN
,
max_input_len
,
k_scale
,
v_scale
,
...
...
@@ -397,6 +399,7 @@ def test_contexted_kv_attention_alibi(
block_table
,
b_start_loc
,
b_seq_len
,
MAX_CTX_LEN
,
max_input_len
,
k_scale
,
v_scale
,
...
...
@@ -413,6 +416,7 @@ def test_contexted_kv_attention_alibi(
block_table
,
b_start_loc
,
b_seq_len
,
MAX_CTX_LEN
,
max_input_len
,
k_scale
,
v_scale
,
...
...
tests/kernels/test_uva.py
0 → 100644
View file @
675ba75f
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm.utils
import
get_cuda_view_from_cpu_tensor
,
is_uva_available
# Device strings used to parametrize the tests below: only "cuda:0" when
# exactly one CUDA device is reported, otherwise "cuda:0" and "cuda:1".
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_cpu_write(device):
    """Check that CPU-side writes are visible through the CUDA view.

    A pinned CPU tensor is created and wrapped with
    ``get_cuda_view_from_cpu_tensor``. Three elements are written
    through the CPU tensor, the view is doubled in place, and the
    doubled values are read back through the view.
    """
    # Scope the default device to this test instead of changing it
    # globally, so later tests are not affected.
    with torch.device(device):
        cpu_tensor = torch.zeros(10,
                                 10,
                                 device="cpu",
                                 pin_memory=True,
                                 dtype=torch.int32)
        cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
        assert cuda_view.device.type == "cuda"

        assert cuda_view[0, 0] == 0
        assert cuda_view[2, 3] == 0
        assert cuda_view[4, 5] == 0

        cpu_tensor[0, 0] = 1
        cpu_tensor[2, 3] = 2
        cpu_tensor[4, 5] = -1

        cuda_view.mul_(2)
        assert cuda_view[0, 0] == 2
        assert cuda_view[2, 3] == 4
        assert cuda_view[4, 5] == -2
@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_gpu_write(device):
    """Check that writes through the CUDA view are visible on the CPU.

    A pinned CPU tensor is created and wrapped with
    ``get_cuda_view_from_cpu_tensor``. Three elements are written
    through the view, the view is doubled in place, and the doubled
    values are read back from the CPU tensor.
    """
    # Scope the default device to this test instead of changing it
    # globally, so later tests are not affected.
    with torch.device(device):
        cpu_tensor = torch.zeros(10,
                                 10,
                                 device="cpu",
                                 pin_memory=True,
                                 dtype=torch.int32)
        cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
        assert cuda_view.device.type == "cuda"

        assert cuda_view[0, 0] == 0
        assert cuda_view[2, 3] == 0
        assert cuda_view[4, 5] == 0

        cuda_view[0, 0] = 1
        cuda_view[2, 3] = 2
        cuda_view[4, 5] = -1
        cuda_view.mul_(2)

        # CUDA work is queued asynchronously; wait for it to finish before
        # reading the host memory directly.
        torch.cuda.synchronize(cuda_view.device)

        assert cpu_tensor[0, 0] == 2
        assert cpu_tensor[2, 3] == 4
        assert cpu_tensor[4, 5] == -2
\ No newline at end of file
tests/lora/conftest.py
View file @
675ba75f
...
...
@@ -2,7 +2,6 @@
import
tempfile
from
collections
import
OrderedDict
from
typing
import
TypedDict
from
unittest.mock
import
MagicMock
,
patch
import
pytest
...
...
@@ -26,28 +25,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA
from
vllm.platforms
import
current_platform
# Entry of LONG_LORA_INFOS: a LoRA id paired with a context-length label
# such as "16k".
ContextIDInfo = TypedDict("ContextIDInfo", {
    "lora_id": int,
    "context_length": str,
})
# Value type of the mapping built by the long_context_infos fixture: the LoRA
# checkpoint (a string) paired with a context-length label such as "16k".
ContextInfo = TypedDict("ContextInfo", {
    "lora": str,
    "context_length": str,
})
# Long-context LoRA ids and the context-length label attached to each.
LONG_LORA_INFOS: list[ContextIDInfo] = [
    ContextIDInfo(lora_id=lora_id, context_length=context_length)
    for lora_id, context_length in ((1, "16k"), (2, "16k"), (3, "32k"))
]
@
pytest
.
fixture
()
def
should_do_global_cleanup_after_test
(
request
)
->
bool
:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
...
...
@@ -241,39 +218,6 @@ def long_context_lora_files_16k_1():
return
snapshot_download
(
repo_id
=
"SangBinCho/long_context_16k_testing_1"
)
@pytest.fixture(scope="session")
def long_context_lora_files_16k_2():
    """Session fixture: result of ``snapshot_download`` for the second 16k
    long-context test repository."""
    repo = "SangBinCho/long_context_16k_testing_2"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def long_context_lora_files_32k():
    """Session fixture: result of ``snapshot_download`` for the 32k
    long-context test repository."""
    repo = "SangBinCho/long_context_32k_testing"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def long_context_infos(long_context_lora_files_16k_1,
                       long_context_lora_files_16k_2,
                       long_context_lora_files_32k):
    """Map each id in ``LONG_LORA_INFOS`` to its checkpoint and context length.

    Calls ``cleanup_dist_env_and_memory(shutdown_ray=True)`` first, then
    returns ``{lora_id: {"context_length": ..., "lora": ...}}``.

    Raises:
        AssertionError: if ``LONG_LORA_INFOS`` contains an id other than
            1, 2 or 3.
    """
    cleanup_dist_env_and_memory(shutdown_ray=True)

    checkpoint_by_id = {
        1: long_context_lora_files_16k_1,
        2: long_context_lora_files_16k_2,
        3: long_context_lora_files_32k,
    }
    infos: dict[int, ContextInfo] = {}
    for entry in LONG_LORA_INFOS:
        lora_id = entry["lora_id"]
        if lora_id not in checkpoint_by_id:
            raise AssertionError("Unknown lora id")
        infos[lora_id] = {
            "context_length": entry["context_length"],
            "lora": checkpoint_by_id[lora_id],
        }
    return infos
@
pytest
.
fixture
def
llama_2_7b_engine_extra_embeddings
():
cleanup_dist_env_and_memory
(
shutdown_ray
=
True
)
...
...
tests/lora/data/long_context_test_data.py
deleted
100644 → 0
View file @
5cc98918
This source diff could not be displayed because it is too large. You can
view the blob
instead.
tests/lora/test_baichuan.py
View file @
675ba75f
...
...
@@ -40,14 +40,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return
generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for every
    test in this module, so each test runs with both engines.

    It could be promoted to conftest.py to cover a whole test package.
    """
def
test_baichuan_lora
(
baichuan_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
...
...
tests/lora/test_chatglm3_tp.py
View file @
675ba75f
...
...
@@ -18,6 +18,14 @@ EXPECTED_LORA_OUTPUT = [
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for every
    test in this module, so each test runs with both engines.

    It could be promoted to conftest.py to cover a whole test package.
    """
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
PROMPT_TEMPLATE
.
format
(
query
=
"How many singers do we have?"
),
...
...
@@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return
generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for every
    test in this module, so each test runs with both engines.

    It could be promoted to conftest.py to cover a whole test package.
    """
@
create_new_process_for_each_test
()
def
test_chatglm3_lora
(
chatglm3_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
...
...
tests/lora/test_gemma.py
deleted
100644 → 0
View file @
5cc98918
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
vllm
from
vllm.lora.request
import
LoRARequest
from
vllm.platforms
import
current_platform
MODEL_PATH
=
"google/gemma-7b"
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    """Generate completions for three fixed quote prompts.

    A ``LoRARequest`` built from ``lora_id`` and ``lora_path`` is attached
    when ``lora_id`` is truthy; otherwise no LoRA request is passed. Each
    prompt and its stripped completion is printed, and the stripped
    completions are returned in output order.
    """
    quote_prompts = [
        "Quote: Imagination is",
        "Quote: Be yourself;",
        "Quote: Painting is poetry that is seen rather than felt,",
    ]
    params = vllm.SamplingParams(temperature=0, max_tokens=32)

    request = None
    if lora_id:
        request = LoRARequest(str(lora_id), lora_id, lora_path)
    results = llm.generate(quote_prompts, params, lora_request=request)

    completions: list[str] = []
    for result in results:
        prompt = result.prompt
        completion = result.outputs[0].text.strip()
        completions.append(completion)
        print(f"Prompt: {prompt!r}, Generated text: {completion!r}")
    return completions
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for every
    test in this module, so each test runs with both engines.

    It could be promoted to conftest.py to cover a whole test package.
    """
# The V1 lora test for this model requires more than 24GB.
@pytest.mark.skip_v1
@pytest.mark.xfail(current_platform.is_rocm(),
                   reason="There can be output mismatch on ROCm")
def test_gemma_lora(gemma_lora_files):
    """Sample with LoRA ids 1 and 2 (same adapter files) and check that every
    completion starts with the expected text."""
    engine = vllm.LLM(MODEL_PATH,
                      max_model_len=1024,
                      enable_lora=True,
                      max_loras=4,
                      enable_chunked_prefill=True)

    expected_lora_output = [
        "more important than knowledge.\nAuthor: Albert Einstein\n",
        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
        "and poetry is painting that is felt rather than seen.\n"
        "Author: Leonardo da Vinci\n",
    ]

    for lora_id in (1, 2):
        generated = do_sample(engine, gemma_lora_files, lora_id=lora_id)
        for idx, expected_prefix in enumerate(expected_lora_output):
            assert generated[idx].startswith(expected_prefix)
tests/lora/test_layers.py
View file @
675ba75f
# SPDX-License-Identifier: Apache-2.0
import
importlib
import
random
from
copy
import
deepcopy
from
dataclasses
import
dataclass
...
...
@@ -20,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.lora.layers
import
(
BaseLayerWithLoRA
,
ColumnParallelLinearWithLoRA
,
LinearScalingRotaryEmbeddingWithLoRA
,
LogitsProcessorWithLoRA
,
LoRAMapping
,
MergedColumnParallelLinearWithLoRA
,
MergedQKVParallelLinearWithLoRA
,
...
...
@@ -29,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA
,
VocabParallelEmbeddingWithLoRA
)
# yapf: enable
from
vllm.lora.models
import
(
LongContextLoRAContext
,
LoRALayerWeights
,
PackedLoRALayerWeights
)
from
vllm.lora.models
import
LoRALayerWeights
,
PackedLoRALayerWeights
from
vllm.lora.punica_wrapper
import
get_punica_wrapper
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
...
...
@@ -38,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
ReplicatedLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
,
get_masked_input_and_mask
)
from
vllm.model_executor.utils
import
set_random_seed
...
...
@@ -60,32 +56,16 @@ DEVICES = ([
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
if
current_platform
.
is_cuda_alike
()
else
[
"cpu"
])
#For GPU, we will launch different triton kernels between the prefill and decode
# stages, so we need to verify this. prefill stage(True) or decode stage(False)
# prefill stage(True) or decode stage(False)
STAGES
=
[
True
,
False
]
# With the inclusion of V1 tests (look at the run_with_both_engines_lora),
# the tests in this file run twice, once with the V0 engine and then with
# the V1 engine.
# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
# with the inclusion of V1 tests to maintain the CI test times.
NUM_RANDOM_SEEDS
=
5
# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
# 256 before. It is cut to half with the inclusion of V1 tests to maintain
# the CI test times.
NUM_RANDOM_SEEDS
=
6
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
=
128
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
# Reload punica_gpu as the kernels used are tied to engine type.
from
vllm.lora.punica_wrapper
import
punica_gpu
importlib
.
reload
(
punica_gpu
)
def
clean_cache
():
# Release any memory we might be holding on to. CI runs OOMs otherwise.
from
vllm.lora.ops.triton_ops.utils
import
(
_LORA_A_PTR_DICT
,
_LORA_B_PTR_DICT
)
...
...
@@ -95,6 +75,24 @@ def v1(run_with_both_engines_lora):
yield
@pytest.fixture(autouse=True)
def skip_cuda_with_stage_false(request):
    """
    Skip the ``stage=False`` parametrization on cuda-like platforms.

    There the same kernels serve both the prefill and the decode stage and
    'stage' is generally ignored, so running one of the two values is enough.
    """
    if current_platform.is_cuda_alike():
        try:
            node = request.node
            if hasattr(node, "callspec") and hasattr(node.callspec, "params"):
                call_params = node.callspec.params
                stage_is_false = ("stage" in call_params
                                  and call_params["stage"] is False)
                if stage_is_false:
                    pytest.skip("Skip test when stage=False")
        except Exception:
            # Best effort: failing to inspect the parameters must not break
            # the test itself.
            pass
    yield
def
get_random_id_to_index
(
num_loras
:
int
,
num_slots
:
int
,
log
:
bool
=
True
)
->
list
[
Optional
[
int
]]:
...
...
@@ -1016,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
atol
=
atol
)
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 8])
@pytest.mark.parametrize("device", ["cuda"])
@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0),
                                             (6.0, 1.0)])
@pytest.mark.parametrize("max_position", [11, 4096, 32768])
@pytest.mark.parametrize("is_neox_style", [True, False])
@pytest.mark.parametrize("rotary_dim", [None, 32])
@pytest.mark.parametrize("head_size", [32, 108])
@pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
                    reason="Only CUDA backends are supported")
def test_rotary_embedding_long_context(dist_init, num_loras, device,
                                       scaling_factors, max_position,
                                       is_neox_style, rotary_dim, head_size,
                                       seq_len) -> None:
    """Exercise ``LinearScalingRotaryEmbeddingWithLoRA`` next to a rope
    created by ``get_rope`` with ``rope_type="linear"``.

    The test asserts that the offsets in ``scaling_factor_to_offset`` grow by
    ``scaling_factor * max_position`` per entry, then runs both rope modules
    on the same random positions/query/key.

    NOTE(review): the two ``torch.allclose`` calls at the end discard their
    results, so the outputs of the two modules are never actually asserted
    equal. Confirm whether an assertion was intended before relying on this
    test as an equivalence check.
    """
    dtype = torch.float16
    max_loras = 8
    seed = 0
    # Seed before any random input is drawn so the run is reproducible.
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             long_lora_scaling_factors=scaling_factors,
                             lora_dtype=dtype)

    # ``None`` means "rotate the full head".
    if rotary_dim is None:
        rotary_dim = head_size
    base = 10000
    batch_size = 5 * num_loras
    num_heads = 7

    # Verify lora is equivalent to linear scaling rotary embedding.
    rope = get_rope(
        head_size,
        rotary_dim,
        max_position,
        base,
        is_neox_style,
    )
    lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
    lora_rope.set_mapping(punica_wrapper)
    lora_rope.create_lora_weights(max_loras, lora_config)
    # Reference module: same geometry, but built directly with linear scaling.
    linear_rope = get_rope(head_size, rotary_dim, max_position, base,
                           is_neox_style, {
                               "rope_type": "linear",
                               "factor": scaling_factors
                           })
    linear_rope = linear_rope.to(dtype=dtype)
    id_to_index = get_random_id_to_index(num_loras, max_loras)
    # Only the index/prompt mappings are used; the generated inputs are
    # discarded.
    _, index_mapping, prompt_mapping = create_random_inputs(
        active_lora_ids=[0],
        num_inputs=batch_size,
        input_size=(1, max_position),
        input_range=(0, lora_config.lora_extra_vocab_size),
        input_type=torch.float16,
        device=device)

    lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
    long_lora_context = LongContextLoRAContext(list(scaling_factors),
                                               rotary_dim)

    next_expected_offset = 0
    # Make sure the offset is correct.
    scaling_factor_to_offset = lora_rope.scaling_factor_to_offset
    for scaling_factor, offset in scaling_factor_to_offset.items():
        assert offset == next_expected_offset
        next_expected_offset += scaling_factor * max_position

    # Index ``i`` is used as the LoRA id for ``scaling_factors[i]``; factors
    # missing from the offset table fall back to offset 0.
    for i in range(len(scaling_factors)):
        long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get(
            scaling_factors[i], 0)
    punica_wrapper.update_metadata(
        lora_mapping,
        id_to_index,
        max_loras,
        512,
        lora_config.lora_extra_vocab_size,
        long_lora_context=long_lora_context,
    )
    # lora_rope.set_mapping(*mapping_info)

    positions = torch.randint(0, max_position, (batch_size, seq_len))
    query = torch.randn(batch_size,
                        seq_len,
                        num_heads * head_size,
                        dtype=dtype)
    key = torch.randn_like(query)
    ref_q, ref_k = linear_rope(positions, query, key)
    actual_q, actual_k = lora_rope(positions, query, key)

    # NOTE(review): return values are ignored, so nothing is checked here.
    torch.allclose(ref_q, actual_q)
    torch.allclose(ref_k, actual_k)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
)))
...
...
tests/lora/test_llama_tp.py
View file @
675ba75f
...
...
@@ -28,6 +28,14 @@ EXPECTED_LORA_OUTPUT = [
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for every
    test in this module, so each test runs with both engines.

    It could be promoted to conftest.py to cover a whole test package.
    """
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
"[user] Write a SQL query to answer the question based on the table schema.
\n\n
context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)
\n\n
question: Name the ICAO for lilongwe international airport [/user] [assistant]"
,
# noqa: E501
...
...
@@ -71,16 +79,6 @@ def generate_and_test(llm, sql_lora_files):
print
(
"removing lora"
)
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for every
    test in this module, so each test runs with both engines.

    It could be promoted to conftest.py to cover a whole test package.
    """
# V1 Test: Failing due to numerics on V1.
@
pytest
.
mark
.
skip_v1
@
create_new_process_for_each_test
()
def
test_llama_lora
(
sql_lora_files
):
...
...
@@ -126,8 +124,6 @@ def test_llama_lora_warmup(sql_lora_files):
"less when using lora than when not using lora"
)
# V1 Test: Failing due to numerics on V1.
@
pytest
.
mark
.
skip_v1
@
multi_gpu_test
(
num_gpus
=
4
)
@
create_new_process_for_each_test
()
def
test_llama_lora_tp4
(
sql_lora_files
):
...
...
@@ -157,20 +153,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
enable_chunked_prefill
=
True
,
)
generate_and_test
(
llm
,
sql_lora_files
)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
    """Run ``generate_and_test`` on a 4-way tensor-parallel engine with fully
    sharded LoRAs and LoRA bias enabled."""
    engine_kwargs = {
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 4,
        "fully_sharded_loras": True,
        "enable_lora_bias": True,
        "enable_chunked_prefill": True,
    }
    engine = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(engine, sql_lora_files)
tests/lora/test_long_context.py
deleted
100644 → 0
View file @
5cc98918
# SPDX-License-Identifier: Apache-2.0
import
ast
from
typing
import
Optional
import
numpy
as
np
import
pytest
import
vllm
from
vllm
import
SamplingParams
from
vllm.lora.layers
import
LinearScalingRotaryEmbeddingWithLoRA
from
vllm.lora.request
import
LoRARequest
from
vllm.model_executor.layers.rotary_embedding
import
(
LinearScalingRotaryEmbedding
)
from
.data.long_context_test_data
import
prompts_and_responses
# Scaling factor associated with each context-length label.
context_len_to_scaling_factor = {"16k": 4, "32k": 8}

# One shared set of sampling params for all requests.
sampling_params = SamplingParams(max_tokens=100, temperature=0)
def _create_lora_request(lora_id, long_context_infos):
    """Build the ``LoRARequest`` for ``lora_id`` from ``long_context_infos``.

    The request name is the context-length label followed by the id; the id
    is appended because two LoRAs share the 16K label and must still be told
    apart. The last positional argument is ``4096 * scaling_factor``
    (presumably the maximum long-LoRA length - confirm against
    ``LoRARequest``).
    """
    info = long_context_infos[lora_id]
    context_len = info["context_length"]
    scaling_factor = context_len_to_scaling_factor[context_len]
    request_name = f"{context_len}{lora_id}"
    return LoRARequest(
        request_name,
        lora_id,
        info["lora"],
        None,
        4096 * scaling_factor,
    )
def evaluate_json_response(model_response, golden_response):
    """Evaluates the model response against the golden response.

    Returns a score between 0 and 1, where 1 is a perfect match and 0 is no
    match. The score quantifies how well the model is able to extract the
    golden JSON from the long context.

    Args:
        model_response: Text of a Python/JSON-style literal, parsed with
            ``ast.literal_eval``.
        golden_response: Expected dictionary, at most two levels deep.

    Returns:
        Fraction of golden values reproduced by the model. A golden
        attribute that is missing from the model response counts as one
        missed value. If the golden response contains no values at all,
        there is nothing to miss and 1.0 is returned.

    Raises:
        ValueError: if ``model_response`` cannot be parsed as a literal.
    """
    try:
        model_response = ast.literal_eval(model_response)
    except Exception as e:
        raise ValueError(
            f"Model response is not a valid JSON. Expected {golden_response}, "
            f"got {model_response}") from e

    # A parsed literal that is not a dictionary (list, number, ...) cannot
    # contain any of the golden attributes; score it as all-missing instead
    # of failing on the lookups below.
    if not isinstance(model_response, dict):
        model_response = {}

    # Normally, we would flatten the dictionary and compare the values, but in
    # this case, we know that the dictionary is only 2 levels deep
    positive_values = 0
    total_values = 0
    # We look at all the attributes of the person that we are extracting a
    # biography of and compare them to the golden response
    for person_attribute, person_attribute_value in golden_response.items():
        if person_attribute not in model_response:
            # We count a missing sub-dict as a single missed value.
            total_values += 1
            continue

        model_value = model_response[person_attribute]
        if isinstance(person_attribute_value, dict):
            # The model may answer with a scalar where a sub-dict is
            # expected; every sub-attribute is then a miss.
            model_sub_values = (model_value
                                if isinstance(model_value, dict) else {})
            for (sub_attribute,
                 sub_attribute_value) in person_attribute_value.items():
                total_values += 1
                if (sub_attribute in model_sub_values
                        and model_sub_values[sub_attribute]
                        == sub_attribute_value):
                    positive_values += 1
        else:
            total_values += 1
            if model_value == person_attribute_value:
                positive_values += 1

    if total_values == 0:
        return 1.0
    # Return a score between 0 and 1
    return positive_values / total_values
def generate(
    llm: vllm.LLM,
    inputs: tuple[str, SamplingParams, Optional[LoRARequest]],
):
    """Run one ``(prompt, sampling params, LoRA request)`` triple through
    ``llm.generate`` and return the stripped text of the first completion of
    the first output."""
    prompt, params, adapter = inputs
    first_output = llm.generate(prompt, params, lora_request=adapter)[0]
    completion = first_output.outputs[0]
    return completion.text.strip()
def batched_generate(
    llm: vllm.LLM,
    inputs: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
    """Queue every ``(prompt, sampling params, LoRA request)`` triple on the
    engine, run it once, and return the stripped first completion of each
    output."""
    # Add requests to the engine and run the engine
    for prompt, params, adapter in inputs:
        llm._validate_and_add_requests(prompt,
                                       params,
                                       lora_request=adapter,
                                       prompt_adapter_request=None)

    engine_outputs = llm._run_engine(use_tqdm=True)
    return [item.outputs[0].text.strip() for item in engine_outputs]
@pytest.fixture(scope="module")
def lora_llm(long_context_infos):
    """Module-scoped LoRA-enabled Llama-2-13b-chat engine, configured with
    the scaling factor of every entry in ``long_context_infos``."""
    long_lora_factors = tuple(
        context_len_to_scaling_factor[info["context_length"]]
        for info in long_context_infos.values())

    llm = vllm.LLM(
        "meta-llama/Llama-2-13b-chat-hf",
        enable_lora=True,
        max_num_seqs=16,
        max_loras=2,
        long_lora_scaling_factors=long_lora_factors,
        max_num_batched_tokens=4096 * 8,
        tensor_parallel_size=4,
        # FIXME enable async output processor
        disable_async_output_proc=True,
        distributed_executor_backend="mp",
        enable_chunked_prefill=True)
    yield llm
    # Drop the fixture's reference once the module's tests are done.
    del llm
def test_rotary_emb_replaced(dist_init):
    """Verify that the rotary embedding of every layer is replaced.

    Modules named ``*rotary_emb*`` must be LoRA wrappers, while their
    ``base_layer`` children must be plain linear-scaling rotary embeddings.
    """
    from vllm.engine.arg_utils import EngineArgs
    from vllm.worker.model_runner import ModelRunner

    args = EngineArgs("meta-llama/Llama-2-7b-hf",
                      long_lora_scaling_factors=(4.0, ),
                      enable_lora=True)
    runner = ModelRunner(
        vllm_config=args.create_engine_config(),
        is_driver_worker=True,
    )
    runner.load_model()

    wrapped_count = 0
    for name, module in runner.model.named_modules(remove_duplicate=False):
        if "rotary_emb" not in name:
            continue
        if "base_layer" in name:
            assert isinstance(module, LinearScalingRotaryEmbedding)
        else:
            wrapped_count += 1
            assert isinstance(module, LinearScalingRotaryEmbeddingWithLoRA)

    # Llama 2 has 32 layers.
    assert wrapped_count == 32
@pytest.mark.skip_global_cleanup
def test_batched_rope_kernel(lora_llm, long_context_infos):
    """Test the batched kernel by comparing the results of batched and
    non-batched generation.
    """

    def first_prompt_request(lora_id, info):
        # (prompt, prompt_sampling_params, prompt_lora_request) for the first
        # prompt of this LoRA's context length.
        return (prompts_and_responses[info["context_length"]][0]["prompt"],
                sampling_params,
                _create_lora_request(lora_id, long_context_infos))

    # Non-batched reference: one engine call per LoRA.
    non_batched_results: list[str] = [
        generate(lora_llm, first_prompt_request(lora_id, info))
        for lora_id, info in long_context_infos.items()
    ]

    # Batched run over the same requests, in the same order.
    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = [
                                    first_prompt_request(lora_id, info)
                                    for lora_id, info in
                                    long_context_infos.items()
                                ]
    batched_results = batched_generate(lora_llm, batched_prompts)

    # Results should be the same
    for non_batched, batched in zip(non_batched_results, batched_results):
        assert non_batched == batched, (
            "Non batched and batched results should be the "
            f"same:\n{batched}\n{non_batched}")
@pytest.mark.skip_global_cleanup
def test_self_consistency(lora_llm, long_context_infos):
    """Test consistency of the batched kernel by permuting batched inputs
    and comparing the results to the non-permuted batched results.

    Position ``k`` of the permuted batch holds the request that sat at
    position ``permutation[k]`` of the ordered batch, so those are the two
    results that must agree. This assumes, as the rest of this file does,
    that ``batched_generate`` returns results in submission order.
    """
    # Materialize once; the permutation indexes into this list.
    lora_entries = list(long_context_infos.items())
    num_loras = len(lora_entries)

    def build_batch(entries):
        # One (prompt, sampling params, LoRA request) triple per entry,
        # always using the first prompt of that LoRA's context length.
        return [(prompts_and_responses[info["context_length"]][0]["prompt"],
                 sampling_params,
                 _create_lora_request(lora_id, long_context_infos))
                for lora_id, info in entries]

    # Create results in order of long_context_infos
    batched_results = batched_generate(lora_llm, build_batch(lora_entries))

    permutation = np.random.default_rng(seed=42).permutation(num_loras)

    # Create results in random order of permutation
    permutated_batched_results = batched_generate(
        lora_llm, build_batch([lora_entries[i] for i in permutation]))

    # Results should be the same. Index the ordered results through the
    # permutation; indexing the permuted results through it instead would
    # only be correct for permutations that are their own inverse.
    for position, original_index in enumerate(permutation):
        expected = batched_results[original_index]
        actual = permutated_batched_results[position]
        assert expected == actual, (f"Results should be the same:\n{expected}"
                                    f"\n{actual}")
@pytest.mark.skip_global_cleanup
def test_quality(lora_llm, long_context_infos):
    """Score every LoRA answer against its golden answer.

    Each prompt registered for a LoRA's context length is generated with that
    LoRA and scored with ``evaluate_json_response``. Every individual score
    must exceed 0.3 and the mean over all scores must exceed 0.5. A response
    that is not a valid dictionary literal makes the scorer raise, which
    fails the test.

    Original author's notes: this acts as a mini-benchmark; a failure
    indicates that the LoRA model's quality is suboptimal compared to the
    merged model. Merged versions of the models are available as part of the
    `conftest`, and the test was expected to run for about 1 minute on a
    p4de.24xlarge instance.
    """
    scores: list[float] = []
    for lora_id, info in long_context_infos.items():
        for case in prompts_and_responses[info["context_length"]]:
            request = (case["prompt"], sampling_params,
                       _create_lora_request(lora_id, long_context_infos))
            response = generate(lora_llm, request)
            golden_answer = case["golden_answer"]
            score = evaluate_json_response(response, golden_answer)
            scores.append(score)
            assert score > 0.3, (
                "Quality of the answer is not good enough. "
                f"Expected {golden_answer}, got {response}")

    assert np.mean(scores) > 0.5
@pytest.mark.skip_global_cleanup
def test_max_len(lora_llm, long_context_infos):
    """Test that a ValueError is raised when the input of a given LoRA
    model exceeds the maximum length."""
    # Each LoRA model has its own maximum length, so check them one by one.
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        lora_request = _create_lora_request(lora_id, long_context_infos)

        # A prompt from the test data should be fine
        fitting_prompt = prompts_and_responses[context_len][0]["prompt"]
        generate(lora_llm, (fitting_prompt, sampling_params, lora_request))

        # The same prompt repeated twice should raise an error
        oversized_prompt = fitting_prompt * 2
        with pytest.raises(ValueError):
            generate(lora_llm,
                     (oversized_prompt, sampling_params, lora_request))

    # Also test batched: for every LoRA in turn, add one group of requests in
    # which only that LoRA's prompt is doubled (turned into a bad prompt).
    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = [
                                    (prompts_and_responses[
                                        info["context_length"]][0]["prompt"] *
                                     (2 if lora_id == lora_id_with_bad_inputs
                                      else 1), sampling_params,
                                     _create_lora_request(
                                         lora_id, long_context_infos))
                                    for lora_id_with_bad_inputs in
                                    long_context_infos
                                    for lora_id, info in
                                    long_context_infos.items()
                                ]

    with pytest.raises(ValueError):
        batched_generate(lora_llm, batched_prompts)
tests/lora/test_lora_manager.py
View file @
675ba75f
...
...
@@ -7,7 +7,6 @@ import torch
from
safetensors.torch
import
load_file
from
torch
import
nn
from
vllm
import
envs
from
vllm.config
import
LoRAConfig
from
vllm.lora.layers
import
(
ColumnParallelLinearWithLoRA
,
MergedColumnParallelLinearWithLoRA
,
...
...
@@ -33,6 +32,17 @@ DEVICES = ([
]
if
current_platform
.
is_cuda_alike
()
else
[
"cpu"
])
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """
    Force ``VLLM_USE_V1=0`` for the duration of each test in this module.

    Some tests depend on V0 internals. Since both V0 and V1 use the same
    LoRAModelManager it is okay to just test V0.
    """
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_USE_V1", "0")
        # The override is undone when the context exits after the test.
        yield
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_from_lora_tensors
(
sql_lora_files
,
device
):
tensors
=
load_file
(
...
...
@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert
manager
.
device
==
device
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_USE_V1
,
reason
=
"Test leverages V0 internals."
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lru_cache_worker_adapter_manager
(
llama_2_7b_model_extra_embeddings
,
sql_lora_files
,
device
):
...
...
@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device
)
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_USE_V1
,
reason
=
"Test leverages V0 internals."
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_worker_adapter_manager
(
llama_2_7b_model_extra_embeddings
,
sql_lora_files
,
device
):
...
...
tests/lora/test_minicpmv_tp.py
View file @
675ba75f
...
...
@@ -58,7 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
@
pytest
.
mark
.
xfail
(
current_platform
.
is_rocm
(),
reason
=
"MiniCPM-V dependency xformers incompatible with ROCm"
)
@
create_new_process_for_each_test
()
def
test_minicpmv_lora
(
minicpmv_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
...
...
@@ -78,6 +77,8 @@ def test_minicpmv_lora(minicpmv_lora_files):
assert
EXPECTED_OUTPUT
[
i
].
startswith
(
output2
[
i
])
@
pytest
.
mark
.
skipif
(
current_platform
.
is_cuda_alike
(),
reason
=
"Skipping to avoid redundant model tests"
)
@
pytest
.
mark
.
xfail
(
current_platform
.
is_rocm
(),
reason
=
"MiniCPM-V dependency xformers incompatible with ROCm"
)
...
...
@@ -99,6 +100,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
assert
EXPECTED_OUTPUT
[
i
].
startswith
(
output_tp
[
i
])
@
pytest
.
mark
.
skipif
(
current_platform
.
is_cuda_alike
(),
reason
=
"Skipping to avoid redundant model tests"
)
@
pytest
.
mark
.
xfail
(
current_platform
.
is_rocm
(),
reason
=
"MiniCPM-V dependency xformers incompatible with ROCm"
)
...
...
tests/lora/test_phi.py
View file @
675ba75f
...
...
@@ -10,6 +10,14 @@ MODEL_PATH = "microsoft/phi-2"
PROMPT_TEMPLATE
=
"### Instruct: {sql_prompt}
\n\n
### Context: {context}
\n\n
### Output:"
# noqa: E501
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for every
    test in this module, so each test runs with both engines.

    It could be promoted to conftest.py to cover a whole test package.
    """
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
PROMPT_TEMPLATE
.
format
(
...
...
@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return
generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for every
    test in this module, so each test runs with both engines.

    It could be promoted to conftest.py to cover a whole test package.
    """
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@
pytest
.
mark
.
skip_v1
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment