Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fcfc474d
Commit
fcfc474d
authored
Apr 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.3' into v0.8.3-dev
parents
bb94d2e5
296c6572
Changes
503
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
989 additions
and
375 deletions
+989
-375
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+69
-0
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+138
-0
tests/kernels/test_cutlass_moe.py
tests/kernels/test_cutlass_moe.py
+244
-0
tests/kernels/test_gguf.py
tests/kernels/test_gguf.py
+2
-2
tests/kernels/test_lightning_attn.py
tests/kernels/test_lightning_attn.py
+286
-0
tests/kernels/test_mla_decode_cpu.py
tests/kernels/test_mla_decode_cpu.py
+94
-0
tests/kernels/test_moe.py
tests/kernels/test_moe.py
+19
-6
tests/kernels/test_prefix_prefill.py
tests/kernels/test_prefix_prefill.py
+4
-0
tests/kernels/test_uva.py
tests/kernels/test_uva.py
+61
-0
tests/kernels/untest_ggml.py
tests/kernels/untest_ggml.py
+2
-1
tests/lora/conftest.py
tests/lora/conftest.py
+0
-58
tests/lora/data/long_context_test_data.py
tests/lora/data/long_context_test_data.py
+0
-121
tests/lora/test_baichuan.py
tests/lora/test_baichuan.py
+0
-8
tests/lora/test_chatglm3_tp.py
tests/lora/test_chatglm3_tp.py
+8
-8
tests/lora/test_layers.py
tests/lora/test_layers.py
+23
-122
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+8
-29
tests/lora/test_lora_manager.py
tests/lora/test_lora_manager.py
+11
-3
tests/lora/test_minicpmv_tp.py
tests/lora/test_minicpmv_tp.py
+4
-1
tests/lora/test_phi.py
tests/lora/test_phi.py
+8
-8
tests/lora/test_quant_model.py
tests/lora/test_quant_model.py
+8
-8
No files found.
tests/kernels/test_cache.py
View file @
fcfc474d
...
...
@@ -752,3 +752,72 @@ def test_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size,
ops
.
gather_cache
(
src_cache
,
dst
,
block_table
,
cu_seq_lens
,
batch_size
)
torch
.
testing
.
assert_close
(
dst
,
expected
)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
@torch.inference_mode()
def test_concat_and_cache_mla_cpu(
    kv_lora_rank: int,
    qk_rope_head_dim: int,
    num_tokens: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    """Check the CPU ``concat_and_cache_mla`` op against a Python reference.

    Each token's ``kv_c`` and ``k_pe`` rows are written side by side into a
    randomly chosen cache slot; the op must reproduce the same cache layout.
    """
    device = "cpu"
    kv_cache_dtype = "auto"
    current_platform.seed_everything(seed)
    torch.set_default_device(device)

    # Pick distinct slots for the tokens out of the whole cache.
    slot_mapping_lst = random.sample(range(num_blocks * block_size),
                                     num_tokens)
    slot_mapping = torch.tensor(slot_mapping_lst,
                                dtype=torch.long,
                                device=device)

    kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
    k_pe = torch.randn(num_tokens,
                       qk_rope_head_dim,
                       dtype=dtype,
                       device=device)

    scale = torch.tensor(0.1, dtype=torch.float32, device=device)
    kv_cache = _create_mla_cache(num_blocks, block_size,
                                 kv_lora_rank + qk_rope_head_dim, dtype,
                                 kv_cache_dtype, device)

    # Build the expected cache token by token.
    expected = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
    for token, slot in enumerate(slot_mapping_lst):
        block_idx, block_offset = divmod(slot, block_size)
        expected[block_idx, block_offset, :kv_lora_rank] = kv_c[token]
        expected[block_idx, block_offset, kv_lora_rank:] = k_pe[token]

    # kv_cache_dtype is fixed to "auto" above, so only the second branch
    # runs today; the fp8 path is kept to mirror the original test.
    if kv_cache_dtype == "fp8":
        ref_kv_cache = torch.empty_like(expected, dtype=kv_cache.dtype)
        ops.convert_fp8(ref_kv_cache,
                        expected,
                        scale.item(),
                        kv_dtype=kv_cache_dtype)
    else:
        ref_kv_cache = expected

    op_args = (kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale)
    opcheck(
        torch.ops._C_cache_ops.concat_and_cache_mla,
        op_args,
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
    )

    ops.concat_and_cache_mla(*op_args)
    torch.testing.assert_close(kv_cache, ref_kv_cache)
tests/kernels/test_cutlass.py
View file @
fcfc474d
...
...
@@ -3,6 +3,7 @@
Run `pytest tests/kernels/test_cutlass.py`.
"""
import
random
import
pytest
import
torch
...
...
@@ -499,3 +500,140 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
torch
.
float16
)
#print("out:",out)
torch
.
testing
.
assert_close
(
out
,
baseline
,
rtol
=
1e-1
,
atol
=
1e0
)
# def test_cutlass_support_opcheck():
# opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, ))
@pytest.mark.parametrize("num_experts", [8, 64])
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
                                per_out_ch: bool, use_bias: bool):
    """Compare ``ops.cutlass_moe_mm`` with a per-group scaled-mm baseline.

    Every expert (group) gets a random row count ``m_g``; ``n_g`` and ``k_g``
    are shared by all groups. The per-group FP8 inputs are stacked, the
    grouped GEMM is run once, and each group's output slice is checked
    against ``baseline_scaled_mm``.

    NOTE(review): the decorators and the first line of this ``def`` were
    commented out while the rest of the header and the body were left as
    live code. The header is restored here; unsupported GPUs are still
    skipped by the ``skipif`` guard. ``use_bias`` is parametrized but not
    read by the body.
    """
    # Device and dtype setup
    device = "cuda"
    out_dtype = torch.half

    # Create separate A, B, C tensors for each group
    a_tensors = []
    b_tensors = []
    a_scales_tensors = []
    b_scales_tensors = []
    baseline_tensors = []

    expert_offsets = torch.zeros((num_experts + 1),
                                 device=device,
                                 dtype=torch.int32)
    problem_sizes = torch.zeros((num_experts, 3),
                                device=device,
                                dtype=torch.int32)

    if not per_act_token:
        # A single activation scale shared by every group.
        one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32)

    alignment = 16  # 128 // 8
    # For variation, each group has dimensions
    n_g = alignment * random.randint(1, 64)
    k_g = alignment * random.randint(1, 64)
    # Loop-invariant: n_g is the same for every group.
    n_b_scales = n_g if per_out_ch else 1

    for g in range(num_experts):
        m_g = alignment * random.randint(1, 64)

        expert_offsets[g + 1] = expert_offsets[g] + m_g
        problem_sizes[g][0] = m_g
        problem_sizes[g][1] = n_g
        problem_sizes[g][2] = k_g

        m_a_scales = m_g if per_act_token else 1

        print("shape:", m_g, n_g, k_g)

        # Create group-specific A and B (FP8) and output (FP16/FP32)
        a_g = to_fp8(torch.randn((m_g, k_g), device=device))
        b_g = to_fp8(torch.randn((n_g, k_g), device=device).t())
        a_tensors.append(a_g)
        b_tensors.append(b_g)

        # Set up A/B scales
        scale_b = torch.randn((1, n_b_scales),
                              device=device,
                              dtype=torch.float32)
        b_scales_tensors.append(scale_b)

        if per_act_token:
            scale_a = torch.randn((m_a_scales, 1),
                                  device=device,
                                  dtype=torch.float32)
            a_scales_tensors.append(scale_a)
        else:
            scale_a = one_scale_a

        # Compute baseline result for this group
        baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype,
                                        None)
        baseline_tensors.append(baseline_g)

    a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g),
                                    device=device,
                                    dtype=torch.float8_e4m3fn)
    b_tensors_stacked = torch.empty((num_experts, n_g, k_g),
                                    device=device,
                                    dtype=torch.float8_e4m3fn)

    for g in range(num_experts):
        a_tensors_stacked[expert_offsets[g]:expert_offsets[g +
                                                           1]] = a_tensors[g]
        b_tensors_stacked[g] = b_tensors[g].t()
    b_tensors_stacked = b_tensors_stacked.transpose(1, 2)

    if per_act_token:
        a_scales_tensors_stacked = torch.empty(
            (expert_offsets[num_experts], 1),
            device=device,
            dtype=torch.float32)
        for g in range(num_experts):
            a_scales_tensors_stacked[
                expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g]
    else:
        a_scales_tensors_stacked = one_scale_a

    b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales),
                                           device=device,
                                           dtype=torch.float32)
    for g in range(num_experts):
        b_scales_tensors_stacked[g] = b_scales_tensors[g]

    out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g),
                                      device=device,
                                      dtype=out_dtype)

    ab_strides = torch.full((num_experts, ),
                            a_tensors_stacked.stride(0),
                            device="cuda",
                            dtype=torch.int64)
    c_strides = torch.full((num_experts, ),
                           out_tensors_stacked.stride(0),
                           device="cuda",
                           dtype=torch.int64)

    ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked,
                       b_tensors_stacked, a_scales_tensors_stacked,
                       b_scales_tensors_stacked, expert_offsets[:-1],
                       problem_sizes, ab_strides, ab_strides, c_strides)

    # Validate each group's result against the baseline
    for g in range(num_experts):
        baseline = baseline_tensors[g]
        c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
        torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)
tests/kernels/test_cutlass_moe.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.fused_moe.cutlass_moe
import
cutlass_moe_fp8
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
fused_experts
,
fused_topk
)
from
vllm.platforms
import
current_platform
# Expert counts the MoE tests in this file are parametrized over ("e").
NUM_EXPERTS = [40, 64]
# Values of the "topk" argument the MoE tests are parametrized over.
TOP_KS = [6, 8]
def run(a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
        w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
        ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
        ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
    """Call ``cutlass_moe_fp8`` under a single-pipeline-stage vLLM config.

    ``a_scale`` is forwarded as the ``a1_scale`` keyword; every other
    argument is passed through positionally in the order the kernel wrapper
    expects. Returns whatever ``cutlass_moe_fp8`` returns.
    """
    parallel_config = ParallelConfig(pipeline_parallel_size=1)
    vllm_config = VllmConfig(parallel_config=parallel_config)
    with set_current_vllm_config(vllm_config):
        result = cutlass_moe_fp8(a,
                                 w1_q,
                                 w2_q,
                                 w1_scale,
                                 w2_scale,
                                 topk_weights,
                                 topk_ids,
                                 ab_strides1,
                                 c_strides1,
                                 ab_strides2,
                                 c_strides2,
                                 a1_scale=a_scale)
        return result
@pytest.mark.parametrize("m", [2, 64, 224])
@pytest.mark.parametrize("n", [1024, 3072])
@pytest.mark.parametrize("k", [1024, 1536])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_moe_no_graph(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
):
    """Compare eager ``cutlass_moe_fp8`` with ``fused_experts``.

    The activations and expert weights are quantized to FP8; the reference
    path runs ``fused_experts`` on the de-quantized copies so both paths see
    the same quantization error.

    Args:
        m: Number of rows of the activation ``a`` (shape ``(m, k)``).
        n: Half of the first-projection output size (``w1`` is
            ``(e, 2 * n, k)``, ``w2`` is ``(e, k, n)``).
        k: Hidden size of the activation.
        e: Number of experts.
        topk: Passed to ``fused_topk``.
        per_act_token: Use per-token dynamic scales for the activation.
        per_out_ch: Use per-row dynamic scales for the weights.
    """
    current_platform.seed_everything(7)
    with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(
                pipeline_parallel_size=1))):

        dtype = torch.half

        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

        # Get the right scale for tests.
        _, a_scale1 = ops.scaled_fp8_quant(
            a, use_per_token_if_dynamic=per_act_token)
        a_q, _ = ops.scaled_fp8_quant(a,
                                      a_scale1,
                                      use_per_token_if_dynamic=per_act_token)

        # De-quantized activation fed to the reference implementation.
        a_d = a_q.float().mul(a_scale1).to(dtype)

        n_b_scales = 2 * n if per_out_ch else 1
        k_b_scales = k if per_out_ch else 1

        w1_q = torch.empty((e, 2 * n, k),
                           device="cuda",
                           dtype=torch.float8_e4m3fn)
        w2_q = torch.empty((e, k, n),
                           device="cuda",
                           dtype=torch.float8_e4m3fn)
        w1_scale = torch.empty((e, n_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)
        w2_scale = torch.empty((e, k_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)

        # Quantize each expert's weights independently.
        for expert in range(e):
            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
                w1[expert], use_per_token_if_dynamic=per_out_ch)
            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
                w2[expert], use_per_token_if_dynamic=per_out_ch)
        w1_q = w1_q.transpose(1, 2)
        w2_q = w2_q.transpose(1, 2)

        # Per-expert stride tensors, built once (the original allocated
        # each of them twice with identical values).
        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
        c_strides1 = torch.full((e, ),
                                2 * n,
                                device="cuda",
                                dtype=torch.int64)
        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)

        # De-quantized weights for the reference implementation; the
        # transpose undoes the layout change applied to w*_q above.
        w1_d = torch.empty_like(w1)
        w2_d = torch.empty_like(w2)
        for expert in range(e):
            w1_d[expert] = (w1_q[expert].t().float() *
                            w1_scale[expert]).to(dtype)
            w2_d[expert] = (w2_q[expert].t().float() *
                            w2_scale[expert]).to(dtype)

        score = torch.randn((m, e), device="cuda", dtype=dtype)
        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)

        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)

        cutlass_output = cutlass_moe_fp8(a,
                                         w1_q,
                                         w2_q,
                                         w1_scale,
                                         w2_scale,
                                         topk_weights,
                                         topk_ids,
                                         ab_strides1,
                                         c_strides1,
                                         ab_strides2,
                                         c_strides2,
                                         a1_scale=a_scale1)

        torch.testing.assert_close(triton_output,
                                   cutlass_output,
                                   atol=5e-2,
                                   rtol=1e-2)
@pytest.mark.parametrize("m", [2, 64, 224])
@pytest.mark.parametrize("n", [1024, 3072])
@pytest.mark.parametrize("k", [1024, 1536])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_moe_cuda_graph(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
):
    """Compare CUDA-graph-captured ``cutlass_moe_fp8`` with ``fused_experts``.

    Same setup as the eager test, but the CUTLASS path is captured into a
    ``torch.cuda.CUDAGraph`` through ``run`` and then replayed before the
    outputs are compared.

    Args:
        m: Number of rows of the activation ``a`` (shape ``(m, k)``).
        n: Half of the first-projection output size (``w1`` is
            ``(e, 2 * n, k)``, ``w2`` is ``(e, k, n)``).
        k: Hidden size of the activation.
        e: Number of experts.
        topk: Passed to ``fused_topk``.
        per_act_token: Use per-token dynamic scales for the activation.
        per_out_ch: Use per-row dynamic scales for the weights.
    """
    current_platform.seed_everything(7)
    with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(
                pipeline_parallel_size=1))):

        dtype = torch.half

        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

        # Get the right scale for tests.
        _, a_scale1 = ops.scaled_fp8_quant(
            a, use_per_token_if_dynamic=per_act_token)
        a_q, _ = ops.scaled_fp8_quant(a,
                                      a_scale1,
                                      use_per_token_if_dynamic=per_act_token)

        # De-quantized activation fed to the reference implementation.
        a_d = a_q.float().mul(a_scale1).to(dtype)

        n_b_scales = 2 * n if per_out_ch else 1
        k_b_scales = k if per_out_ch else 1

        w1_q = torch.empty((e, 2 * n, k),
                           device="cuda",
                           dtype=torch.float8_e4m3fn)
        w2_q = torch.empty((e, k, n),
                           device="cuda",
                           dtype=torch.float8_e4m3fn)
        w1_scale = torch.empty((e, n_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)
        w2_scale = torch.empty((e, k_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)

        # Quantize each expert's weights independently.
        for expert in range(e):
            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
                w1[expert], use_per_token_if_dynamic=per_out_ch)
            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
                w2[expert], use_per_token_if_dynamic=per_out_ch)
        w1_q = w1_q.transpose(1, 2)
        w2_q = w2_q.transpose(1, 2)

        # Per-expert stride tensors, built once (the original allocated
        # each of them twice with identical values).
        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
        c_strides1 = torch.full((e, ),
                                2 * n,
                                device="cuda",
                                dtype=torch.int64)
        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)

        # De-quantized weights for the reference implementation; the
        # transpose undoes the layout change applied to w*_q above.
        w1_d = torch.empty_like(w1)
        w2_d = torch.empty_like(w2)
        for expert in range(e):
            w1_d[expert] = (w1_q[expert].t().float() *
                            w1_scale[expert]).to(dtype)
            w2_d[expert] = (w2_q[expert].t().float() *
                            w2_scale[expert]).to(dtype)

        score = torch.randn((m, e), device="cuda", dtype=dtype)
        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)

        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)

        # Capture the CUTLASS path on a side stream, then replay it.
        stream = torch.cuda.Stream()
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph, stream=stream):
            cutlass_output = run(a, a_scale1, w1_q, w2_q, w1_scale, w2_scale,
                                 topk_weights, topk_ids, ab_strides1,
                                 c_strides1, ab_strides2, c_strides2)
        torch.cuda.synchronize()
        graph.replay()
        torch.cuda.synchronize()

        torch.testing.assert_close(triton_output,
                                   cutlass_output,
                                   atol=9e-2,
                                   rtol=1e-2)
tests/kernels/test_gguf.py
View file @
fcfc474d
...
...
@@ -69,7 +69,7 @@ QUANT_TYPES = [
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
half
]
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
QUANT_TYPES
)
@
torch
.
inference_mode
()
def
test_dequantize
(
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
...
...
@@ -82,7 +82,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
ref_output
=
torch
.
tensor
(
dequantize
(
tensor
.
data
,
quant_type
),
device
=
"cuda"
).
to
(
dtype
)
output
=
ops
.
ggml_dequantize
(
torch
.
tensor
(
tensor
.
data
,
device
=
"cuda"
),
quant_type
,
*
list
(
shape
)
).
to
(
dtype
)
quant_type
,
*
list
(
shape
)
,
dtype
)
torch
.
testing
.
assert_close
(
output
,
ref_output
,
atol
=
1e-2
,
rtol
=
4e-2
)
...
...
tests/kernels/test_lightning_attn.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm.model_executor.layers.lightning_attn
import
(
linear_decode_forward_triton
)
from
vllm.platforms
import
current_platform
# Parameter grids for the lightning-attention tests in this file.
NUM_HEADS = [4, 8]
HEAD_SIZES = [64]
BATCH_SIZES = [1, 2]
SEQ_LENGTHS = [16]
DTYPES = [torch.float32]
def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
    """Sequential reference for the lightning attention core recurrence.

    For every batch entry and head the running state is updated one
    position at a time as ``state = decay * state + outer(k, v)`` and the
    output at that position is ``q @ state``. ``block_size`` is accepted
    for signature parity but is not read here.

    Args:
        q, k: Tensors of shape ``[B, H, S, D]``.
        v: Tensor of shape ``[B, H, S, E]``.
        ed: Decay exponents; a 1-D tensor is reshaped to ``[1, H, 1, 1]``,
            anything else is used as given and indexed as ``[0, h, 0, 0]``.
        block_size: Unused.
        kv_history: Optional initial state of shape ``[B, H, D, E]``; it is
            cloned, never modified.

    Returns:
        ``(output, final_kv_cache)`` where ``output`` is ``[B, H, S, E]`` and
        ``final_kv_cache`` is the final state duplicated along a new axis 2,
        i.e. ``[B, H, 2, D, E]``.
    """
    B, H, S, D = q.shape
    E = v.shape[-1]
    device = q.device
    dtype = q.dtype

    output = torch.zeros((B, H, S, E), dtype=dtype, device=device)

    # Work on an independent copy so the caller's history stays untouched.
    kv_cache = (kv_history.clone() if kv_history is not None else torch.zeros(
        (B, H, D, E), dtype=dtype, device=device))

    decay = torch.exp(-ed)
    if ed.dim() == 1:
        decay = decay.view(1, -1, 1, 1)

    # Each (batch, head) pair evolves independently, so walk the sequence
    # for one pair at a time.
    for b in range(B):
        for h in range(H):
            head_decay = decay[0, h, 0, 0]
            for step in range(S):
                kv_outer = torch.outer(k[b, h, step], v[b, h, step])
                # Same update order as the Triton kernel: decay, then add.
                kv_cache[b, h] = head_decay * kv_cache[b, h] + kv_outer
                output[b, h, step] = q[b, h, step] @ kv_cache[b, h]

    # The real implementation returns [B, H, 2, D, E]; mirror that layout
    # by repeating the final state along a new axis.
    final_kv_cache = torch.stack([kv_cache, kv_cache], dim=2)
    return output, final_kv_cache
def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx):
    """Sequential reference for one linear-attention decode step.

    For each batch entry whose slot id is not ``-1`` and each head, the
    cached state is replaced by ``outer(k, v) + exp(-slope) * state`` and
    the head's output is ``q @ new_state``. ``kv_caches`` is updated in
    place; padded entries (slot id ``-1``) leave both the cache and the
    corresponding output row (zeros) untouched.

    Returns:
        Tensor of shape ``[B, H * D]`` with the per-head outputs laid out
        back to back.
    """
    B, H, _, D = q.shape
    output = torch.zeros(B, H * D, dtype=q.dtype, device=q.device)

    # One decay factor per head, shaped [H, 1, 1].
    decay = torch.exp(-slope_rate).view(-1, 1, 1)

    for b in range(B):
        if slot_idx[b].item() != -1:
            for h in range(H):
                # Only the single decode position (index 0) is used.
                query_vec = q[b, h, 0]
                key_vec = k[b, h, 0]
                value_vec = v[b, h, 0]

                previous_state = kv_caches[b, h]
                new_state = (torch.outer(key_vec, value_vec) +
                             decay[h, 0, 0] * previous_state)

                start = h * D
                output[b, start:start + D] = query_vec @ new_state
                kv_caches[b, h] = new_state

    return output
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_linear_decode_forward_triton(
    batch_size: int,
    num_heads: int,
    head_size: int,
    dtype: torch.dtype,
):
    """Check ``linear_decode_forward_triton`` against the Python reference.

    Both the returned output and the in-place updated KV cache must agree
    with ``reference_linear_decode`` within loose tolerances.
    """
    torch.set_default_device("cuda")
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    current_platform.seed_everything(42)

    base = 0.01
    qkv_shape = (batch_size, num_heads, 1, head_size)
    # Drawn in q, k, v order so the random stream matches across runs.
    q, k, v = (base * torch.randn(*qkv_shape, dtype=dtype) for _ in range(3))

    kv_caches = base * torch.randn(batch_size,
                                   num_heads,
                                   head_size,
                                   head_size,
                                   dtype=dtype,
                                   device="cuda")
    # The reference mutates its cache, so give it its own copy.
    kv_caches_copy = kv_caches.clone()

    # Head h uses slope 0.1 * (h + 1).
    slope_rate = torch.tensor([0.1 * (h + 1) for h in range(num_heads)],
                              device="cuda")

    slot_idx = torch.arange(batch_size, device="cuda")

    triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
                                                 slope_rate, slot_idx)
    reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
                                               slope_rate, slot_idx)

    tolerances = dict(rtol=1e-1, atol=1e-1)
    torch.testing.assert_close(triton_output, reference_output, **tolerances)
    torch.testing.assert_close(kv_caches, kv_caches_copy, **tolerances)

    assert triton_output.shape == (batch_size, num_heads * head_size)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_linear_decode_forward_triton_with_padding(
    num_heads: int,
    head_size: int,
    dtype: torch.dtype,
):
    """Check the Triton decode kernel when one batch entry is padding.

    The batch has four entries and the third carries slot id ``-1``. Only
    the non-padded rows of the output and of the KV cache are compared
    with the Python reference.
    """
    torch.set_default_device("cuda")
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    current_platform.seed_everything(42)

    batch_size = 4
    base = 0.01
    qkv_shape = (batch_size, num_heads, 1, head_size)
    # Drawn in q, k, v order so the random stream matches across runs.
    q, k, v = (base * torch.randn(*qkv_shape, dtype=dtype) for _ in range(3))

    kv_caches = base * torch.randn(batch_size,
                                   num_heads,
                                   head_size,
                                   head_size,
                                   dtype=dtype,
                                   device="cuda")
    # The reference mutates its cache, so give it its own copy.
    kv_caches_copy = kv_caches.clone()

    # Head h uses slope 0.1 * (h + 1).
    slope_rate = torch.tensor([0.1 * (h + 1) for h in range(num_heads)],
                              device="cuda")

    # Entry 2 is padding.
    slot_idx = torch.tensor([0, 1, -1, 2], device="cuda")

    triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
                                                 slope_rate, slot_idx)
    reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
                                               slope_rate, slot_idx)

    valid_indices = slot_idx != -1
    padding_mask = valid_indices.unsqueeze(1).expand(-1,
                                                     num_heads * head_size)
    triton_masked = triton_output[padding_mask]
    reference_masked = reference_output[padding_mask]

    atol, rtol = 1.5e-1, 1.5e-1

    # Caches are compared first, and only for the non-padded entries.
    for i, is_valid in enumerate(valid_indices.tolist()):
        if is_valid:
            torch.testing.assert_close(kv_caches[i],
                                       kv_caches_copy[i],
                                       rtol=rtol,
                                       atol=atol)

    torch.testing.assert_close(triton_masked,
                               reference_masked,
                               rtol=rtol,
                               atol=atol)

    assert triton_output.shape == (batch_size, num_heads * head_size)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENGTHS)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_lightning_attention_reference(
    batch_size: int,
    num_heads: int,
    head_size: int,
    seq_len: int,
    dtype: torch.dtype,
):
    """Check ``lightning_attention`` against the sequential reference.

    Both implementations start from the same random KV history (each gets
    its own copy) and must agree on the attention output and on the
    returned KV cache within loose tolerances.
    """
    torch.set_default_device("cuda")
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    current_platform.seed_everything(42)

    base = 0.01
    qkv_shape = (batch_size, num_heads, seq_len, head_size)
    # Drawn in q, k, v order so the random stream matches across runs.
    q, k, v = (base * torch.randn(*qkv_shape, dtype=dtype) for _ in range(3))

    # Head h uses decay exponent 0.1 * (h + 1).
    ed = torch.tensor([0.1 * (h + 1) for h in range(num_heads)],
                      device="cuda")

    kv_history = base * torch.randn(batch_size,
                                    num_heads,
                                    head_size,
                                    head_size,
                                    dtype=dtype,
                                    device="cuda")
    kv_history_clone = kv_history.clone()

    ref_output, ref_kv_cache = reference_lightning_attention(
        q, k, v, ed, 256, kv_history)

    from vllm.model_executor.layers.lightning_attn import lightning_attention
    actual_output, actual_kv_cache = lightning_attention(
        q, k, v, ed, 256, kv_history_clone)

    atol, rtol = 1.5e-1, 1.5e-1
    torch.testing.assert_close(ref_output,
                               actual_output,
                               rtol=rtol,
                               atol=atol)
    torch.testing.assert_close(ref_kv_cache,
                               actual_kv_cache,
                               rtol=rtol,
                               atol=atol)

    assert ref_output.shape == (batch_size, num_heads, seq_len, head_size)
    assert ref_kv_cache.shape == actual_kv_cache.shape
tests/kernels/test_mla_decode_cpu.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
import
torch.nn.functional
as
F
from
torch
import
Tensor
import
vllm._custom_ops
as
ops
from
vllm.platforms
import
current_platform
def cdiv(a, b):
    """Return ``a`` divided by ``b``, rounded up (for positive ``b``)."""
    # Adding b - 1 before flooring pushes any remainder to the next integer.
    padded = a + b - 1
    return padded // b
def
ref_mla
(
out
:
Tensor
,
# (bs, num_heads, v_head_dim)
query
:
Tensor
,
# (bs, num_heads, head_dim)
kv_cache
:
Tensor
,
# (num_blocks, block_size, head_dim)
scale
:
float
,
block_tables
:
Tensor
,
# (bs, max_num_blocks)
seq_lens
:
Tensor
,
# (bs,)
):
bs
,
num_heads
,
v_head_dim
=
out
.
shape
head_dim
=
query
.
shape
[
2
]
for
i
in
range
(
bs
):
# gather and flatten KV-cache
kv
=
kv_cache
[
block_tables
[
i
]]
# (max_num_blocks, block_size, head_dim)
kv
=
kv
.
view
(
1
,
-
1
,
head_dim
)[:,
:
seq_lens
[
i
]]
# (1, seq_len, head_dim)
v
=
kv
[:,
:,
:
v_head_dim
]
q
=
query
[
i
].
view
(
num_heads
,
1
,
head_dim
)
o
=
F
.
scaled_dot_product_attention
(
q
,
kv
,
v
,
scale
=
scale
,
enable_gqa
=
True
)
out
[
i
]
=
o
.
view
(
num_heads
,
v_head_dim
)
return
out
@pytest.mark.parametrize("bs", [4])
@pytest.mark.parametrize("mean_seq_len", [256])
@pytest.mark.parametrize("h_q", [16])
@pytest.mark.parametrize("d", [576])
@pytest.mark.parametrize("dv", [512])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("dtype", [torch.float, torch.half, torch.bfloat16])
@pytest.mark.parametrize("varlen", [False, True])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_mla_decode_cpu(
    bs: int,
    mean_seq_len: int,
    h_q: int,
    d: int,
    dv: int,
    block_size: int,
    dtype: torch.dtype,
    varlen: bool,
):
    """Check ``ops.mla_decode_kvcache_cpu`` against ``ref_mla``.

    Cache positions past each sequence's length are filled with NaN so an
    out-of-bounds read by the kernel shows up as NaN in its output.

    Args:
        bs: Batch size.
        mean_seq_len: Sequence length (mean of the sampled lengths when
            ``varlen`` is set).
        h_q: Number of query heads.
        d: Head dimension of the query and of the cache entries.
        dv: Head dimension of the output.
        block_size: Number of cache positions per block.
        dtype: Installed as the default dtype while the test runs.
        varlen: Sample a different length per sequence.
    """
    # The default dtype is process-global; restore it afterwards so the
    # half/bfloat16 cases do not leak into tests that run later.
    previous_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        torch.manual_seed(0)

        scale = d**(-0.5)
        if varlen:
            seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
            seq_lens = seq_lens.clip(2).to(torch.int32)
        else:
            seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
        max_seq_len = seq_lens.max().item()
        seqlen_pad = cdiv(max_seq_len, 256) * 256  # is this necessary?

        q = torch.randn(bs, h_q, d)
        # Each sequence owns a contiguous run of blocks.
        block_table = torch.arange(bs * seqlen_pad // block_size,
                                   dtype=torch.int32)
        block_table = block_table.view(bs, seqlen_pad // block_size)

        kv_cache = torch.randn(block_table.numel(), block_size, d)
        # Poison everything beyond each sequence's valid length.
        for i, seq_len in enumerate(seq_lens.tolist()):
            kv_cache.view(bs, seqlen_pad, d)[i, seq_len:] = float("nan")

        out_mla = q.new_zeros(bs, h_q, dv)
        ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table,
                                   seq_lens)

        out_ref = q.new_zeros(bs, h_q, dv)
        ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)

        assert not out_mla.isnan().any(), "Likely read out of bounds"
        torch.testing.assert_close(out_mla, out_ref)
    finally:
        torch.set_default_dtype(previous_dtype)
tests/kernels/test_moe.py
View file @
fcfc474d
...
...
@@ -3,7 +3,6 @@
Run `pytest tests/kernels/test_moe.py`.
"""
import
pytest
import
torch
from
torch.nn
import
Parameter
...
...
@@ -216,11 +215,17 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"padding"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
@
torch
.
inference_mode
()
def
test_mixtral_moe
(
dtype
:
torch
.
dtype
,
padding
:
bool
):
def
test_mixtral_moe
(
dtype
:
torch
.
dtype
,
padding
:
bool
,
use_rocm_aiter
:
bool
,
monkeypatch
):
"""Make sure our Mixtral MoE implementation agrees with the one from
huggingface."""
if
use_rocm_aiter
:
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
# Instantiate our and huggingface's MoE blocks
config
=
MixtralConfig
()
hf_moe
=
MixtralSparseMoeBlock
(
config
).
to
(
dtype
).
to
(
"cuda"
)
...
...
@@ -268,10 +273,18 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool):
torch
.
bfloat16
:
1e-2
,
}
torch
.
testing
.
assert_close
(
hf_states
.
flatten
(
0
,
1
),
vllm_states
,
rtol
=
mixtral_moe_tol
[
dtype
],
atol
=
mixtral_moe_tol
[
dtype
])
if
use_rocm_aiter
:
# The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501
# https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501
torch
.
testing
.
assert_close
(
hf_states
.
flatten
(
0
,
1
),
vllm_states
,
rtol
=
0.01
,
atol
=
100
)
else
:
torch
.
testing
.
assert_close
(
hf_states
.
flatten
(
0
,
1
),
vllm_states
,
rtol
=
mixtral_moe_tol
[
dtype
],
atol
=
mixtral_moe_tol
[
dtype
])
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
...
...
tests/kernels/test_prefix_prefill.py
View file @
fcfc474d
...
...
@@ -167,6 +167,7 @@ def test_contexted_kv_attention(
block_table
,
b_start_loc
,
b_seq_len
,
MAX_CTX_LEN
,
max_input_len
,
k_scale
,
v_scale
,
...
...
@@ -183,6 +184,7 @@ def test_contexted_kv_attention(
block_table
,
b_start_loc
,
b_seq_len
,
MAX_CTX_LEN
,
max_input_len
,
k_scale
,
v_scale
,
...
...
@@ -401,6 +403,7 @@ def test_contexted_kv_attention_alibi(
block_table
,
b_start_loc
,
b_seq_len
,
MAX_CTX_LEN
,
max_input_len
,
k_scale
,
v_scale
,
...
...
@@ -417,6 +420,7 @@ def test_contexted_kv_attention_alibi(
block_table
,
b_start_loc
,
b_seq_len
,
MAX_CTX_LEN
,
max_input_len
,
k_scale
,
v_scale
,
...
...
tests/kernels/test_uva.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm.utils
import
get_cuda_view_from_cpu_tensor
,
is_uva_available
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
@
pytest
.
mark
.
skipif
(
not
is_uva_available
(),
reason
=
"UVA is not available."
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_cpu_write
(
device
):
torch
.
set_default_device
(
device
)
cpu_tensor
=
torch
.
zeros
(
10
,
10
,
device
=
"cpu"
,
pin_memory
=
True
,
dtype
=
torch
.
int32
)
cuda_view
=
get_cuda_view_from_cpu_tensor
(
cpu_tensor
)
assert
cuda_view
.
device
.
type
==
"cuda"
assert
cuda_view
[
0
,
0
]
==
0
assert
cuda_view
[
2
,
3
]
==
0
assert
cuda_view
[
4
,
5
]
==
0
cpu_tensor
[
0
,
0
]
=
1
cpu_tensor
[
2
,
3
]
=
2
cpu_tensor
[
4
,
5
]
=
-
1
cuda_view
.
mul_
(
2
)
assert
cuda_view
[
0
,
0
]
==
2
assert
cuda_view
[
2
,
3
]
==
4
assert
cuda_view
[
4
,
5
]
==
-
2
@
pytest
.
mark
.
skipif
(
not
is_uva_available
(),
reason
=
"UVA is not available."
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_gpu_write
(
device
):
torch
.
set_default_device
(
device
)
cpu_tensor
=
torch
.
zeros
(
10
,
10
,
device
=
"cpu"
,
pin_memory
=
True
,
dtype
=
torch
.
int32
)
cuda_view
=
get_cuda_view_from_cpu_tensor
(
cpu_tensor
)
assert
cuda_view
.
device
.
type
==
"cuda"
assert
cuda_view
[
0
,
0
]
==
0
assert
cuda_view
[
2
,
3
]
==
0
assert
cuda_view
[
4
,
5
]
==
0
cuda_view
[
0
,
0
]
=
1
cuda_view
[
2
,
3
]
=
2
cuda_view
[
4
,
5
]
=
-
1
cuda_view
.
mul_
(
2
)
assert
cpu_tensor
[
0
,
0
]
==
2
assert
cpu_tensor
[
2
,
3
]
==
4
assert
cpu_tensor
[
4
,
5
]
==
-
2
\ No newline at end of file
tests/kernels/untest_ggml.py
View file @
fcfc474d
...
...
@@ -15,7 +15,8 @@ def test_ggml_opcheck(quant_type):
qweight
=
torch
.
randint
(
0
,
100
,
shape
,
device
=
'cuda'
,
dtype
=
torch
.
uint8
)
m
=
qweight
.
shape
[
0
]
n
=
qweight
.
shape
[
1
]
//
type_size
*
block_size
opcheck
(
torch
.
ops
.
_C
.
ggml_dequantize
,
(
qweight
,
quant_type
,
m
,
n
))
opcheck
(
torch
.
ops
.
_C
.
ggml_dequantize
,
(
qweight
,
quant_type
,
m
,
n
,
torch
.
float16
))
x
=
torch
.
rand
((
m
,
512
),
device
=
'cuda'
,
dtype
=
torch
.
float16
)
opcheck
(
torch
.
ops
.
_C
.
ggml_mul_mat_a8
,
...
...
tests/lora/conftest.py
View file @
fcfc474d
...
...
@@ -2,7 +2,6 @@
import
tempfile
from
collections
import
OrderedDict
from
typing
import
TypedDict
from
unittest.mock
import
MagicMock
,
patch
import
pytest
...
...
@@ -28,28 +27,6 @@ from vllm.platforms import current_platform
from
..utils
import
models_path_prefix
class
ContextIDInfo
(
TypedDict
):
lora_id
:
int
context_length
:
str
class
ContextInfo
(
TypedDict
):
lora
:
str
context_length
:
str
LONG_LORA_INFOS
:
list
[
ContextIDInfo
]
=
[{
"lora_id"
:
1
,
"context_length"
:
"16k"
,
},
{
"lora_id"
:
2
,
"context_length"
:
"16k"
,
},
{
"lora_id"
:
3
,
"context_length"
:
"32k"
,
}]
@
pytest
.
fixture
()
def
should_do_global_cleanup_after_test
(
request
)
->
bool
:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
...
...
@@ -256,41 +233,6 @@ def long_context_lora_files_16k_1():
return
os
.
path
.
join
(
models_path_prefix
,
"SangBinCho/long_context_16k_testing_1"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
long_context_lora_files_16k_2
():
# return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2")
return
os
.
path
.
join
(
models_path_prefix
,
"SangBinCho/long_context_16k_testing_2"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
long_context_lora_files_32k
():
# return snapshot_download(repo_id="SangBinCho/long_context_32k_testing")
return
os
.
path
.
join
(
models_path_prefix
,
"SangBinCho/long_context_32k_testing"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
long_context_infos
(
long_context_lora_files_16k_1
,
long_context_lora_files_16k_2
,
long_context_lora_files_32k
):
cleanup_dist_env_and_memory
(
shutdown_ray
=
True
)
infos
:
dict
[
int
,
ContextInfo
]
=
{}
for
lora_checkpoint_info
in
LONG_LORA_INFOS
:
lora_id
=
lora_checkpoint_info
[
"lora_id"
]
if
lora_id
==
1
:
lora
=
long_context_lora_files_16k_1
elif
lora_id
==
2
:
lora
=
long_context_lora_files_16k_2
elif
lora_id
==
3
:
lora
=
long_context_lora_files_32k
else
:
raise
AssertionError
(
"Unknown lora id"
)
infos
[
lora_id
]
=
{
"context_length"
:
lora_checkpoint_info
[
"context_length"
],
"lora"
:
lora
,
}
return
infos
@
pytest
.
fixture
def
llama_2_7b_engine_extra_embeddings
():
cleanup_dist_env_and_memory
(
shutdown_ray
=
True
)
...
...
tests/lora/data/long_context_test_data.py
deleted
100644 → 0
View file @
bb94d2e5
This source diff could not be displayed because it is too large. You can
view the blob
instead.
tests/lora/test_baichuan.py
View file @
fcfc474d
...
...
@@ -43,14 +43,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return
generated_texts
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def
test_baichuan_lora
(
baichuan_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
...
...
tests/lora/test_chatglm3_tp.py
View file @
fcfc474d
...
...
@@ -20,6 +20,14 @@ EXPECTED_LORA_OUTPUT = [
]
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
PROMPT_TEMPLATE
.
format
(
query
=
"How many singers do we have?"
),
...
...
@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return
generated_texts
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
create_new_process_for_each_test
()
def
test_chatglm3_lora
(
chatglm3_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
...
...
tests/lora/test_layers.py
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
importlib
import
random
from
copy
import
deepcopy
from
dataclasses
import
dataclass
...
...
@@ -20,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.lora.layers
import
(
BaseLayerWithLoRA
,
ColumnParallelLinearWithLoRA
,
LinearScalingRotaryEmbeddingWithLoRA
,
LogitsProcessorWithLoRA
,
LoRAMapping
,
MergedColumnParallelLinearWithLoRA
,
MergedQKVParallelLinearWithLoRA
,
...
...
@@ -29,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA
,
VocabParallelEmbeddingWithLoRA
)
# yapf: enable
from
vllm.lora.models
import
(
LongContextLoRAContext
,
LoRALayerWeights
,
PackedLoRALayerWeights
)
from
vllm.lora.models
import
LoRALayerWeights
,
PackedLoRALayerWeights
from
vllm.lora.punica_wrapper
import
get_punica_wrapper
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
...
...
@@ -38,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
ReplicatedLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
,
get_masked_input_and_mask
)
from
vllm.model_executor.utils
import
set_random_seed
...
...
@@ -60,32 +56,16 @@ DEVICES = ([
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
if
current_platform
.
is_cuda_alike
()
else
[
"cpu"
])
#For GPU, we will launch different triton kernels between the prefill and decode
# stages, so we need to verify this. prefill stage(True) or decode stage(False)
# prefill stage(True) or decode stage(False)
STAGES
=
[
True
,
False
]
# With the inclusion of V1 tests (look at the run_with_both_engines_lora),
# the tests in this file run twice, once with the V0 engine and then with
# the V1 engine.
# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
# with the inclusion of V1 tests to maintain the CI test times.
NUM_RANDOM_SEEDS
=
5
# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
# 256 before. It is cut to half with the inclusion of V1 tests to maintain
# the CI test times.
NUM_RANDOM_SEEDS
=
6
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
=
128
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
# Reload punica_gpu as the kernels used are tied to engine type.
from
vllm.lora.punica_wrapper
import
punica_gpu
importlib
.
reload
(
punica_gpu
)
def
clean_cache
():
# Release any memory we might be holding on to. CI runs OOMs otherwise.
from
vllm.lora.ops.triton_ops.utils
import
(
_LORA_A_PTR_DICT
,
_LORA_B_PTR_DICT
)
...
...
@@ -95,6 +75,24 @@ def v1(run_with_both_engines_lora):
yield
@
pytest
.
fixture
(
autouse
=
True
)
def
skip_cuda_with_stage_false
(
request
):
"""
On cuda-like platforms, we use the same kernels for prefill and decode
stage, and 'stage' is generally ignored, so we only need to test once.
"""
if
current_platform
.
is_cuda_alike
():
try
:
if
hasattr
(
request
.
node
,
"callspec"
)
and
hasattr
(
request
.
node
.
callspec
,
"params"
):
params
=
request
.
node
.
callspec
.
params
if
"stage"
in
params
and
params
[
"stage"
]
is
False
:
pytest
.
skip
(
"Skip test when stage=False"
)
except
Exception
:
pass
yield
def
get_random_id_to_index
(
num_loras
:
int
,
num_slots
:
int
,
log
:
bool
=
True
)
->
list
[
Optional
[
int
]]:
...
...
@@ -1016,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
atol
=
atol
)
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
8
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cuda"
])
@
pytest
.
mark
.
parametrize
(
"scaling_factors"
,
[(
1.0
,
),
(
4.0
,
),
(
4.0
,
8.0
),
(
6.0
,
1.0
)])
@
pytest
.
mark
.
parametrize
(
"max_position"
,
[
11
,
4096
,
32768
])
@
pytest
.
mark
.
parametrize
(
"is_neox_style"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"rotary_dim"
,
[
None
,
32
])
@
pytest
.
mark
.
parametrize
(
"head_size"
,
[
32
,
108
])
@
pytest
.
mark
.
parametrize
(
"seq_len"
,
[
11
,
1024
])
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda_alike
(),
reason
=
"Only CUDA backends are supported"
)
def
test_rotary_embedding_long_context
(
dist_init
,
num_loras
,
device
,
scaling_factors
,
max_position
,
is_neox_style
,
rotary_dim
,
head_size
,
seq_len
)
->
None
:
dtype
=
torch
.
float16
max_loras
=
8
seed
=
0
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
get_punica_wrapper
(
8192
,
256
,
device
,
max_loras
=
max_loras
)
assert
check_punica_wrapper
(
punica_wrapper
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
long_lora_scaling_factors
=
scaling_factors
,
lora_dtype
=
dtype
)
if
rotary_dim
is
None
:
rotary_dim
=
head_size
base
=
10000
batch_size
=
5
*
num_loras
num_heads
=
7
# Verify lora is equivalent to linear scaling rotary embedding.
rope
=
get_rope
(
head_size
,
rotary_dim
,
max_position
,
base
,
is_neox_style
,
)
lora_rope
=
LinearScalingRotaryEmbeddingWithLoRA
(
rope
)
lora_rope
.
set_mapping
(
punica_wrapper
)
lora_rope
.
create_lora_weights
(
max_loras
,
lora_config
)
linear_rope
=
get_rope
(
head_size
,
rotary_dim
,
max_position
,
base
,
is_neox_style
,
{
"rope_type"
:
"linear"
,
"factor"
:
scaling_factors
})
linear_rope
=
linear_rope
.
to
(
dtype
=
dtype
)
id_to_index
=
get_random_id_to_index
(
num_loras
,
max_loras
)
_
,
index_mapping
,
prompt_mapping
=
create_random_inputs
(
active_lora_ids
=
[
0
],
num_inputs
=
batch_size
,
input_size
=
(
1
,
max_position
),
input_range
=
(
0
,
lora_config
.
lora_extra_vocab_size
),
input_type
=
torch
.
float16
,
device
=
device
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
long_lora_context
=
LongContextLoRAContext
(
list
(
scaling_factors
),
rotary_dim
)
next_expected_offset
=
0
# Make sure the offset is correct.
scaling_factor_to_offset
=
lora_rope
.
scaling_factor_to_offset
for
scaling_factor
,
offset
in
scaling_factor_to_offset
.
items
():
assert
offset
==
next_expected_offset
next_expected_offset
+=
scaling_factor
*
max_position
for
i
in
range
(
len
(
scaling_factors
)):
long_lora_context
.
offsets_by_lora_id
[
i
]
=
scaling_factor_to_offset
.
get
(
scaling_factors
[
i
],
0
)
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
,
long_lora_context
=
long_lora_context
,
)
# lora_rope.set_mapping(*mapping_info)
positions
=
torch
.
randint
(
0
,
max_position
,
(
batch_size
,
seq_len
))
query
=
torch
.
randn
(
batch_size
,
seq_len
,
num_heads
*
head_size
,
dtype
=
dtype
)
key
=
torch
.
randn_like
(
query
)
ref_q
,
ref_k
=
linear_rope
(
positions
,
query
,
key
)
actual_q
,
actual_k
=
lora_rope
(
positions
,
query
,
key
)
torch
.
allclose
(
ref_q
,
actual_q
)
torch
.
allclose
(
ref_k
,
actual_k
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
list
(
range
(
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
)))
...
...
tests/lora/test_llama_tp.py
View file @
fcfc474d
...
...
@@ -29,6 +29,14 @@ EXPECTED_LORA_OUTPUT = [
]
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
"[user] Write a SQL query to answer the question based on the table schema.
\n\n
context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)
\n\n
question: Name the ICAO for lilongwe international airport [/user] [assistant]"
,
# noqa: E501
...
...
@@ -72,16 +80,6 @@ def generate_and_test(llm, sql_lora_files):
print
(
"removing lora"
)
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
# V1 Test: Failing due to numerics on V1.
@
pytest
.
mark
.
skip_v1
@
create_new_process_for_each_test
()
def
test_llama_lora
(
sql_lora_files
):
...
...
@@ -127,8 +125,6 @@ def test_llama_lora_warmup(sql_lora_files):
"less when using lora than when not using lora"
)
# V1 Test: Failing due to numerics on V1.
@
pytest
.
mark
.
skip_v1
@
multi_gpu_test
(
num_gpus
=
4
)
@
create_new_process_for_each_test
()
def
test_llama_lora_tp4
(
sql_lora_files
):
...
...
@@ -158,20 +154,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
enable_chunked_prefill
=
True
,
)
generate_and_test
(
llm
,
sql_lora_files
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
create_new_process_for_each_test
()
def
test_llama_lora_tp4_fully_sharded_enable_bias
(
sql_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
4
,
tensor_parallel_size
=
4
,
fully_sharded_loras
=
True
,
enable_lora_bias
=
True
,
enable_chunked_prefill
=
True
,
)
generate_and_test
(
llm
,
sql_lora_files
)
tests/lora/test_lora_manager.py
View file @
fcfc474d
...
...
@@ -7,7 +7,6 @@ import torch
from
safetensors.torch
import
load_file
from
torch
import
nn
from
vllm
import
envs
from
vllm.config
import
LoRAConfig
from
vllm.lora.layers
import
(
ColumnParallelLinearWithLoRA
,
MergedColumnParallelLinearWithLoRA
,
...
...
@@ -33,6 +32,17 @@ DEVICES = ([
]
if
current_platform
.
is_cuda_alike
()
else
[
"cpu"
])
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
Some tests depend on V0 internals. Since both V0 and V1 use the same
LoRAModelManager it is okay to just test V0.
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
yield
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_from_lora_tensors
(
sql_lora_files
,
device
):
tensors
=
load_file
(
...
...
@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert
manager
.
device
==
device
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_USE_V1
,
reason
=
"Test leverages V0 internals."
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_lru_cache_worker_adapter_manager
(
llama_2_7b_model_extra_embeddings
,
sql_lora_files
,
device
):
...
...
@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device
)
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_USE_V1
,
reason
=
"Test leverages V0 internals."
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_worker_adapter_manager
(
llama_2_7b_model_extra_embeddings
,
sql_lora_files
,
device
):
...
...
tests/lora/test_minicpmv_tp.py
View file @
fcfc474d
...
...
@@ -60,7 +60,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
@
pytest
.
mark
.
xfail
(
current_platform
.
is_rocm
(),
reason
=
"MiniCPM-V dependency xformers incompatible with ROCm"
)
@
create_new_process_for_each_test
()
def
test_minicpmv_lora
(
minicpmv_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
...
...
@@ -80,6 +79,8 @@ def test_minicpmv_lora(minicpmv_lora_files):
assert
EXPECTED_OUTPUT
[
i
].
startswith
(
output2
[
i
])
@
pytest
.
mark
.
skipif
(
current_platform
.
is_cuda_alike
(),
reason
=
"Skipping to avoid redundant model tests"
)
@
pytest
.
mark
.
xfail
(
current_platform
.
is_rocm
(),
reason
=
"MiniCPM-V dependency xformers incompatible with ROCm"
)
...
...
@@ -101,6 +102,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
assert
EXPECTED_OUTPUT
[
i
].
startswith
(
output_tp
[
i
])
@
pytest
.
mark
.
skipif
(
current_platform
.
is_cuda_alike
(),
reason
=
"Skipping to avoid redundant model tests"
)
@
pytest
.
mark
.
xfail
(
current_platform
.
is_rocm
(),
reason
=
"MiniCPM-V dependency xformers incompatible with ROCm"
)
...
...
tests/lora/test_phi.py
View file @
fcfc474d
...
...
@@ -12,6 +12,14 @@ MODEL_PATH = os.path.join(models_path_prefix, "microsoft/phi-2")
PROMPT_TEMPLATE
=
"### Instruct: {sql_prompt}
\n\n
### Context: {context}
\n\n
### Output:"
# noqa: E501
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
PROMPT_TEMPLATE
.
format
(
...
...
@@ -50,14 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return
generated_texts
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@
pytest
.
mark
.
skip_v1
...
...
tests/lora/test_quant_model.py
View file @
fcfc474d
...
...
@@ -40,6 +40,14 @@ else:
]
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
,
...
...
@@ -72,14 +80,6 @@ def do_sample(llm: vllm.LLM,
return
generated_texts
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
def
test_quant_model_lora
(
tinyllama_lora_files
,
num_gpus_available
,
model
,
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment