[CI] Prune back the number of tests in tests/kernels/* (#9932)

Signed-off-by: mgoin <michael@neuralmagic.com>

[CI] Prune back the number of tests in tests/kernels/* (#9932)
Signed-off-by: mgoin <michael@neuralmagic.com>
235366fe · Michael Goin · GitHub · 02462465 · 235366fe · 235366fe
Unverified commit 235366fe, authored Nov 05, 2024 by Michael Goin; committed by GitHub on Nov 05, 2024.
10 changed files
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -14,7 +14,7 @@ from .allclose_default import get_default_atol, get_default_rtol
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
-D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
+D = [512, 13824]  # Arbitrary values for testing
 SEEDS = [0]
 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)

--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -33,7 +33,7 @@ NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 # FlashAttention forward only supports head dimension at most 128
 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
-HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
+HEAD_SIZES = [64, 80, 120, 256]
 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]

--- a/tests/kernels/test_awq_marlin.py
+++ b/tests/kernels/test_awq_marlin.py
@@ -14,13 +14,17 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    awq_marlin_quantize)
 from vllm.scalar_type import scalar_types
+NUM_EXPERTS = [8, 64]
+TOP_KS = [2, 6]
+GROUP_SIZES = [-1, 32, 128]
-@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
-@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("m", [1, 33, 64, 222])
-@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("n", [128, 2048])
-@pytest.mark.parametrize("e", [8, 64])
+@pytest.mark.parametrize("k", [128, 1024])
-@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("group_size", GROUP_SIZES)
 @pytest.mark.skipif(not (ops.supports_moe_ops
                         and hasattr(torch.ops._moe_C, "marlin_gemm_moe")),
                    reason="Marlin is not supported on this GPU type.")

--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -25,10 +25,10 @@ PARTITION_SIZE = 512
 DTYPES = [torch.half, torch.bfloat16]
 NUM_GEN_SEQS = [3]  # Arbitrary values for testing
 NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
-NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
+NUM_HEADS = [(40, 40)]  # Arbitrary values for testing
 HEAD_SIZES = [64, 112]
-BLOCK_SIZES = [16, 32]
+BLOCK_SIZES = [16]
 USE_ALIBI = [False, True]
 KV_CACHE_DTYPE = ["auto", "fp8"]
 SEEDS = [0]
@@ -37,7 +37,7 @@ BLOCKSPARSE_LOCAL_BLOCKS = [16]
 BLOCKSPARSE_VERT_STRIDES = [8]
 BLOCKSPARSE_BLOCK_SIZES = [64]
-BLOCKSPARSE_HEADS_SLIDINGS = [0, 2, -1]
+BLOCKSPARSE_HEADS_SLIDINGS = [2, -1]
 BLOCKSPARSE_HOMO_HEADS = [True, False]

--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -13,7 +13,7 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [42]  # Arbitrary values for testing
 NUM_LAYERS = [1]  # Arbitrary values for testing
 NUM_HEADS = [8]  # Arbitrary values for testing
-HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
+HEAD_SIZES = [64, 80, 120, 256]
 BLOCK_SIZES = [8, 16, 32]
 # Arbitrary values for testing

--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -11,6 +11,28 @@ from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+MNK_FACTORS = [
+    (1, 256, 128),
+    (1, 16384, 1024),
+    (1, 24576, 496),
+    (16, 256, 496),
+    (16, 16384, 128),
+    (16, 24576, 4096),
+    (32, 8192, 4096),
+    (32, 16384, 4096),
+    (33, 1024, 1024),
+    (33, 8192, 128),
+    (64, 2048, 496),
+    (64, 16384, 1024),
+    (100, 8192, 496),
+    (128, 32768, 4096),
+    (256, 4096, 4096),
+    (512, 256, 1024),
+    (512, 8192, 4096),
+    (512, 16384, 128),
+    (512, 24576, 128),
+]
 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
@@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int,
            (out, a, b, scale_a, scale_b, bias))
-@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024])
-@pytest.mark.parametrize("k", [128, 496, 1024])
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])
@@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
    cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
-@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 33, 1])
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
-@pytest.mark.parametrize("n", [2048, 8192, 16384, 256, 1024])
-@pytest.mark.parametrize("k", [128, 496, 1024])
 @pytest.mark.parametrize("per_act_token", [True, False])
 @pytest.mark.parametrize("per_out_ch", [True, False])
 @pytest.mark.parametrize("use_bias", [True, False])

--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -7,11 +7,10 @@ from vllm._custom_ops import scaled_int8_quant
 from vllm.platforms import current_platform
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
+HIDDEN_SIZES = [16, 67, 768, 5137, 8193]  # Arbitrary values for testing
-                8193]  # Arbitrary values for testing
 NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
 SEEDS = [0]
-SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
+SCALE = [0.1, 2.1]
 def opcheck_int8_quant_static(output, input, scale, azp=None):
@@ -132,7 +131,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("scale", SCALE[2:])  # Reduce test time
+@pytest.mark.parametrize("scale", SCALE)
 @pytest.mark.parametrize("azp", [-255, 54])
 @torch.inference_mode()
 def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,

--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -35,7 +35,7 @@ K_FULL_OPTS = [False, True]
 USE_FP32_REDUCE_OPTS = [False, True]
 MARLIN_K_CHUNKS = [128]
-MARLIN_N_CHUNKS = [64, 128, 256]
+MARLIN_N_CHUNKS = [64, 256]
 MARLIN_24_K_CHUNKS = [128]
 MARLIN_24_N_CHUNKS = [512]

--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -20,12 +20,15 @@ from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
+NUM_EXPERTS = [8, 64]
+TOP_KS = [2, 6]
-@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
-@pytest.mark.parametrize("n", [2048, 256, 1024])
+@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("k", [128, 511, 1024])
-@pytest.mark.parametrize("e", [8, 64])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_fused_moe(
    m: int,
@@ -93,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype):
                               atol=mixtral_moe_tol[dtype])
-@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("m", [1, 33, 64, 222])
-@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("n", [128, 2048])
-@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("k", [128, 1024])
-@pytest.mark.parametrize("e", [8, 64])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("group_size", [-1, 32, 128])
 @pytest.mark.parametrize("act_order", [True, False])
 @pytest.mark.parametrize("num_bits", [4, 8])
 @pytest.mark.parametrize("is_k_full", [True, False])

--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -11,10 +11,10 @@ from .allclose_default import get_default_atol, get_default_rtol
 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
+HEAD_SIZES = [64, 80, 112, 120, 256]
 ROTARY_DIMS = [None, 32]  # None means rotary dim == head size
-NUM_HEADS = [7, 17]  # Arbitrary values for testing
+NUM_HEADS = [17]  # Arbitrary values for testing
-BATCH_SIZES = [1, 5]  # Arbitrary values for testing
+BATCH_SIZES = [5]  # Arbitrary values for testing
 SEQ_LENS = [11, 8192]  # Arbitrary values for testing
 SEEDS = [0]
 CUDA_DEVICES = [