Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
235366fe
Unverified
Commit
235366fe
authored
Nov 05, 2024
by
Michael Goin
Committed by
GitHub
Nov 05, 2024
Browse files
[CI] Prune back the number of tests in tests/kernels/* (#9932)
Signed-off-by:
mgoin
<
michael@neuralmagic.com
>
parent
02462465
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
60 additions
and
36 deletions
+60
-36
tests/kernels/test_activation.py
tests/kernels/test_activation.py
+1
-1
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+1
-1
tests/kernels/test_awq_marlin.py
tests/kernels/test_awq_marlin.py
+10
-6
tests/kernels/test_blocksparse_attention.py
tests/kernels/test_blocksparse_attention.py
+3
-3
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+1
-1
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+24
-6
tests/kernels/test_int8_quant.py
tests/kernels/test_int8_quant.py
+3
-4
tests/kernels/test_marlin_gemm.py
tests/kernels/test_marlin_gemm.py
+1
-1
tests/kernels/test_moe.py
tests/kernels/test_moe.py
+13
-10
tests/kernels/test_pos_encoding.py
tests/kernels/test_pos_encoding.py
+3
-3
No files found.
tests/kernels/test_activation.py
View file @
235366fe
...
@@ -14,7 +14,7 @@ from .allclose_default import get_default_atol, get_default_rtol
...
@@ -14,7 +14,7 @@ from .allclose_default import get_default_atol, get_default_rtol
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
NUM_TOKENS
=
[
7
,
83
,
2048
]
# Arbitrary values for testing
NUM_TOKENS
=
[
7
,
83
,
2048
]
# Arbitrary values for testing
D
=
[
512
,
4096
,
5120
,
13824
]
# Arbitrary values for testing
D
=
[
512
,
13824
]
# Arbitrary values for testing
SEEDS
=
[
0
]
SEEDS
=
[
0
]
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
...
...
tests/kernels/test_attention.py
View file @
235366fe
...
@@ -33,7 +33,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
...
@@ -33,7 +33,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
# FlashAttention forward only supports head dimension at most 128
# FlashAttention forward only supports head dimension at most 128
# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
120
,
128
,
192
,
256
]
HEAD_SIZES
=
[
64
,
80
,
120
,
256
]
BLOCK_SIZES
=
[
16
,
32
]
BLOCK_SIZES
=
[
16
,
32
]
USE_ALIBI
=
[
False
,
True
]
USE_ALIBI
=
[
False
,
True
]
...
...
tests/kernels/test_awq_marlin.py
View file @
235366fe
...
@@ -14,13 +14,17 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
...
@@ -14,13 +14,17 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
awq_marlin_quantize
)
awq_marlin_quantize
)
from
vllm.scalar_type
import
scalar_types
from
vllm.scalar_type
import
scalar_types
NUM_EXPERTS
=
[
8
,
64
]
TOP_KS
=
[
2
,
6
]
GROUP_SIZES
=
[
-
1
,
32
,
128
]
@
pytest
.
mark
.
parametrize
(
"m"
,
[
64
,
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
128
,
2048
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
33
,
64
,
222
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
1024
,
512
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
128
,
2048
])
@
pytest
.
mark
.
parametrize
(
"e"
,
[
8
,
64
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
1024
])
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
2
,
6
])
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
[
-
1
,
32
,
64
,
128
])
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GROUP_SIZES
)
@
pytest
.
mark
.
skipif
(
not
(
ops
.
supports_moe_ops
@
pytest
.
mark
.
skipif
(
not
(
ops
.
supports_moe_ops
and
hasattr
(
torch
.
ops
.
_moe_C
,
"marlin_gemm_moe"
)),
and
hasattr
(
torch
.
ops
.
_moe_C
,
"marlin_gemm_moe"
)),
reason
=
"Marlin is not supported on this GPU type."
)
reason
=
"Marlin is not supported on this GPU type."
)
...
...
tests/kernels/test_blocksparse_attention.py
View file @
235366fe
...
@@ -25,10 +25,10 @@ PARTITION_SIZE = 512
...
@@ -25,10 +25,10 @@ PARTITION_SIZE = 512
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
]
NUM_GEN_SEQS
=
[
3
]
# Arbitrary values for testing
NUM_GEN_SEQS
=
[
3
]
# Arbitrary values for testing
NUM_PREFILL_SEQS
=
[
3
]
# Arbitrary values for testing
NUM_PREFILL_SEQS
=
[
3
]
# Arbitrary values for testing
NUM_HEADS
=
[(
40
,
40
)
,
(
64
,
8
)
]
# Arbitrary values for testing
NUM_HEADS
=
[(
40
,
40
)]
# Arbitrary values for testing
HEAD_SIZES
=
[
64
,
112
]
HEAD_SIZES
=
[
64
,
112
]
BLOCK_SIZES
=
[
16
,
32
]
BLOCK_SIZES
=
[
16
]
USE_ALIBI
=
[
False
,
True
]
USE_ALIBI
=
[
False
,
True
]
KV_CACHE_DTYPE
=
[
"auto"
,
"fp8"
]
KV_CACHE_DTYPE
=
[
"auto"
,
"fp8"
]
SEEDS
=
[
0
]
SEEDS
=
[
0
]
...
@@ -37,7 +37,7 @@ BLOCKSPARSE_LOCAL_BLOCKS = [16]
...
@@ -37,7 +37,7 @@ BLOCKSPARSE_LOCAL_BLOCKS = [16]
BLOCKSPARSE_VERT_STRIDES
=
[
8
]
BLOCKSPARSE_VERT_STRIDES
=
[
8
]
BLOCKSPARSE_BLOCK_SIZES
=
[
64
]
BLOCKSPARSE_BLOCK_SIZES
=
[
64
]
BLOCKSPARSE_HEADS_SLIDINGS
=
[
0
,
2
,
-
1
]
BLOCKSPARSE_HEADS_SLIDINGS
=
[
2
,
-
1
]
BLOCKSPARSE_HOMO_HEADS
=
[
True
,
False
]
BLOCKSPARSE_HOMO_HEADS
=
[
True
,
False
]
...
...
tests/kernels/test_cache.py
View file @
235366fe
...
@@ -13,7 +13,7 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
...
@@ -13,7 +13,7 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS
=
[
42
]
# Arbitrary values for testing
NUM_TOKENS
=
[
42
]
# Arbitrary values for testing
NUM_LAYERS
=
[
1
]
# Arbitrary values for testing
NUM_LAYERS
=
[
1
]
# Arbitrary values for testing
NUM_HEADS
=
[
8
]
# Arbitrary values for testing
NUM_HEADS
=
[
8
]
# Arbitrary values for testing
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
120
,
128
,
192
,
256
]
HEAD_SIZES
=
[
64
,
80
,
120
,
256
]
BLOCK_SIZES
=
[
8
,
16
,
32
]
BLOCK_SIZES
=
[
8
,
16
,
32
]
# Arbitrary values for testing
# Arbitrary values for testing
...
...
tests/kernels/test_cutlass.py
View file @
235366fe
...
@@ -11,6 +11,28 @@ from tests.kernels.utils import opcheck
...
@@ -11,6 +11,28 @@ from tests.kernels.utils import opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
MNK_FACTORS
=
[
(
1
,
256
,
128
),
(
1
,
16384
,
1024
),
(
1
,
24576
,
496
),
(
16
,
256
,
496
),
(
16
,
16384
,
128
),
(
16
,
24576
,
4096
),
(
32
,
8192
,
4096
),
(
32
,
16384
,
4096
),
(
33
,
1024
,
1024
),
(
33
,
8192
,
128
),
(
64
,
2048
,
496
),
(
64
,
16384
,
1024
),
(
100
,
8192
,
496
),
(
128
,
32768
,
4096
),
(
256
,
4096
,
4096
),
(
512
,
256
,
1024
),
(
512
,
8192
,
4096
),
(
512
,
16384
,
128
),
(
512
,
24576
,
128
),
]
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
]
...
@@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int,
...
@@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int,
(
out
,
a
,
b
,
scale_a
,
scale_b
,
bias
))
(
out
,
a
,
b
,
scale_a
,
scale_b
,
bias
))
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
16
,
32
,
64
,
128
,
256
,
512
,
222
,
100
,
33
])
@
pytest
.
mark
.
parametrize
(
"m,n,k"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
4096
,
8192
,
16384
,
24576
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
...
@@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
...
@@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
cutlass_fp8_gemm_helper
(
m
,
n
,
k
,
per_act_token
,
per_out_ch
,
use_bias
)
cutlass_fp8_gemm_helper
(
m
,
n
,
k
,
per_act_token
,
per_out_ch
,
use_bias
)
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
16
,
32
,
64
,
128
,
256
,
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"m,n,k"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
8192
,
16384
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
...
...
tests/kernels/test_int8_quant.py
View file @
235366fe
...
@@ -7,11 +7,10 @@ from vllm._custom_ops import scaled_int8_quant
...
@@ -7,11 +7,10 @@ from vllm._custom_ops import scaled_int8_quant
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
HIDDEN_SIZES
=
[
16
,
67
,
768
,
5137
,
8193
]
# Arbitrary values for testing
8193
]
# Arbitrary values for testing
NUM_TOKENS
=
[
1
,
7
,
83
,
4096
]
# Arbitrary values for testing
NUM_TOKENS
=
[
1
,
7
,
83
,
4096
]
# Arbitrary values for testing
SEEDS
=
[
0
]
SEEDS
=
[
0
]
SCALE
=
[
0.1
,
0.5
,
0.8
,
1.2
,
2.1
]
SCALE
=
[
0.1
,
2.1
]
def
opcheck_int8_quant_static
(
output
,
input
,
scale
,
azp
=
None
):
def
opcheck_int8_quant_static
(
output
,
input
,
scale
,
azp
=
None
):
...
@@ -132,7 +131,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
...
@@ -132,7 +131,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"scale"
,
SCALE
[
2
:])
# Reduce test time
@
pytest
.
mark
.
parametrize
(
"scale"
,
SCALE
)
@
pytest
.
mark
.
parametrize
(
"azp"
,
[
-
255
,
54
])
@
pytest
.
mark
.
parametrize
(
"azp"
,
[
-
255
,
54
])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_static_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
def
test_static_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
...
...
tests/kernels/test_marlin_gemm.py
View file @
235366fe
...
@@ -35,7 +35,7 @@ K_FULL_OPTS = [False, True]
...
@@ -35,7 +35,7 @@ K_FULL_OPTS = [False, True]
USE_FP32_REDUCE_OPTS
=
[
False
,
True
]
USE_FP32_REDUCE_OPTS
=
[
False
,
True
]
MARLIN_K_CHUNKS
=
[
128
]
MARLIN_K_CHUNKS
=
[
128
]
MARLIN_N_CHUNKS
=
[
64
,
128
,
256
]
MARLIN_N_CHUNKS
=
[
64
,
256
]
MARLIN_24_K_CHUNKS
=
[
128
]
MARLIN_24_K_CHUNKS
=
[
128
]
MARLIN_24_N_CHUNKS
=
[
512
]
MARLIN_24_N_CHUNKS
=
[
512
]
...
...
tests/kernels/test_moe.py
View file @
235366fe
...
@@ -20,12 +20,15 @@ from vllm.model_executor.models.mixtral import MixtralMoE
...
@@ -20,12 +20,15 @@ from vllm.model_executor.models.mixtral import MixtralMoE
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.scalar_type
import
scalar_types
NUM_EXPERTS
=
[
8
,
64
]
TOP_KS
=
[
2
,
6
]
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1024
*
128
,
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
33
,
64
,
222
,
1024
*
128
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
128
,
1024
,
2048
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
511
,
1024
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
511
,
1024
])
@
pytest
.
mark
.
parametrize
(
"e"
,
[
8
,
64
]
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
2
,
6
]
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
def
test_fused_moe
(
def
test_fused_moe
(
m
:
int
,
m
:
int
,
...
@@ -93,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype):
...
@@ -93,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype):
atol
=
mixtral_moe_tol
[
dtype
])
atol
=
mixtral_moe_tol
[
dtype
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
64
,
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
33
,
64
,
222
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
128
,
2048
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
128
,
2048
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
1024
,
512
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
1024
])
@
pytest
.
mark
.
parametrize
(
"e"
,
[
8
,
64
]
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
2
,
6
]
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
[
-
1
,
32
,
64
,
128
])
@
pytest
.
mark
.
parametrize
(
"group_size"
,
[
-
1
,
32
,
128
])
@
pytest
.
mark
.
parametrize
(
"act_order"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"act_order"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
[
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
[
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"is_k_full"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"is_k_full"
,
[
True
,
False
])
...
...
tests/kernels/test_pos_encoding.py
View file @
235366fe
...
@@ -11,10 +11,10 @@ from .allclose_default import get_default_atol, get_default_rtol
...
@@ -11,10 +11,10 @@ from .allclose_default import get_default_atol, get_default_rtol
IS_NEOX_STYLE
=
[
True
,
False
]
IS_NEOX_STYLE
=
[
True
,
False
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
120
,
128
,
192
,
256
]
HEAD_SIZES
=
[
64
,
80
,
112
,
120
,
256
]
ROTARY_DIMS
=
[
None
,
32
]
# None means rotary dim == head size
ROTARY_DIMS
=
[
None
,
32
]
# None means rotary dim == head size
NUM_HEADS
=
[
7
,
17
]
# Arbitrary values for testing
NUM_HEADS
=
[
17
]
# Arbitrary values for testing
BATCH_SIZES
=
[
1
,
5
]
# Arbitrary values for testing
BATCH_SIZES
=
[
5
]
# Arbitrary values for testing
SEQ_LENS
=
[
11
,
8192
]
# Arbitrary values for testing
SEQ_LENS
=
[
11
,
8192
]
# Arbitrary values for testing
SEEDS
=
[
0
]
SEEDS
=
[
0
]
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment