Unverified commit 6317a517, authored by Michael Goin, committed by GitHub
Browse files

Categorize `tests/kernels/` based on kernel type (#16799)


Signed-off-by: mgoin <mgoin64@gmail.com>
parent aa72d9a4
...@@ -16,7 +16,7 @@ import numpy ...@@ -16,7 +16,7 @@ import numpy
import pytest import pytest
import yaml import yaml
RTOL = 0.05 RTOL = 0.08
TEST_DATA_FILE = os.environ.get( TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE", "LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
......
...@@ -317,15 +317,46 @@ steps: ...@@ -317,15 +317,46 @@ steps:
commands: commands:
- pytest -v -s compile/test_full_graph.py - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 1h each - label: Kernels Core Operation Test
mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- tests/kernels/core
commands:
- pytest -v -s kernels/core
- label: Kernels Attention Test %N
source_file_dependencies:
- csrc/attention/
- vllm/attention - vllm/attention
- tests/kernels - vllm/v1/attention
- tests/kernels/attention
commands: commands:
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4 parallelism: 2
- label: Kernels Quantization Test %N
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test
source_file_dependencies:
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
commands:
- pytest -v -s kernels/moe
- label: Kernels Mamba Test
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
commands:
- pytest -v -s kernels/mamba
- label: Tensorizer Test # 11min - label: Tensorizer Test # 11min
# mirror_hardwares: [amd] # mirror_hardwares: [amd]
......
...@@ -6,13 +6,12 @@ from typing import Optional ...@@ -6,13 +6,12 @@ from typing import Optional
import pytest import pytest
import torch import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes from vllm.utils import get_max_shared_memory_bytes
from .allclose_default import get_default_atol, get_default_rtol
if not current_platform.is_rocm(): if not current_platform.is_rocm():
from xformers import ops as xops from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
......
...@@ -156,6 +156,15 @@ def test_env( ...@@ -156,6 +156,15 @@ def test_env(
expected = ("TRITON_MLA_VLLM_V1" expected = ("TRITON_MLA_VLLM_V1"
if use_v1 else "TRITON_MLA") if use_v1 else "TRITON_MLA")
assert backend.get_name() == expected assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
else: else:
backend = get_attn_backend(16, backend = get_attn_backend(16,
torch.float16, torch.float16,
......
...@@ -6,14 +6,13 @@ from typing import Optional ...@@ -6,14 +6,13 @@ from typing import Optional
import pytest import pytest
import torch import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention.ops.blocksparse_attention.interface import ( from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn) LocalStridedBlockSparseAttn)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes from vllm.utils import get_max_shared_memory_bytes
from .allclose_default import get_default_atol, get_default_rtol
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability. # This will change depending on the compute capability.
# - 512 as a buffer # - 512 as a buffer
......
...@@ -5,6 +5,7 @@ import random ...@@ -5,6 +5,7 @@ import random
import pytest import pytest
import torch import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
GeluAndMul, MulAndSilu, GeluAndMul, MulAndSilu,
...@@ -12,8 +13,6 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, ...@@ -12,8 +13,6 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
SiluAndMul) SiluAndMul)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .allclose_default import get_default_atol, get_default_rtol
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing
D = [512, 13824] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment