update kernel test

21c06ecb · zhuwenwen · 103f3110
Commit 21c06ecb authored Jun 15, 2024 by zhuwenwen
5 changed files
--- a/README.md
+++ b/README.md
@@ -53,12 +53,12 @@ git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的
 - 提供2种源码编译方式（进入vllm目录）：
 ```
 1. 编译whl包并安装
-python setup.py bdist_wheel 
+VLLM_INSTALL_PUNICA_KERNELS=1 python setup.py bdist_wheel 
 cd dist
 pip install vllm*

 2. 源码编译安装
-python3 setup.py install 
+VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install 
 ```

 #### 运行基础环境准备

--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -10,7 +10,7 @@ from vllm.attention.selector import which_attn_to_use

 @pytest.mark.parametrize(
    "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
-@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
+@pytest.mark.parametrize("device", ["cpu", "hip"])
 def test_env(name: str, device: str, monkeypatch):
    """Test that the attention selector can be set via environment variable.
    Note that we do not test FlashAttn because it is the default backend.

--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -29,7 +29,7 @@ NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 HEAD_SIZES = [64, 112]
 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
-KV_CACHE_DTYPE = ["auto", "fp8"]
+KV_CACHE_DTYPE = ["auto", "fp8"] if not is_hip() else ["auto"]
 SEEDS = [0]
 CUDA_DEVICES = ['cuda:0']
 BLOCKSPARSE_LOCAL_BLOCKS = [16]

--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -345,37 +345,37 @@ def test_swap_blocks(
                              dist_value_caches[0][dst].cpu())


-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_fp8_e4m3_conversion(
-    num_heads: int,
-    head_size: int,
-    block_size: int,
-    num_blocks: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-
-    low = -224.0
-    high = 224.0
-    shape = (num_blocks, num_heads, head_size, block_size)
-    cache = torch.empty(shape, dtype=dtype, device=device)
-    cache.uniform_(low, high)
-
-    cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
-    ops.convert_fp8(cache_fp8, cache)
-
-    converted_cache = torch.empty_like(cache)
-    ops.convert_fp8(converted_cache, cache_fp8)
-
-    assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
+# @pytest.mark.parametrize("num_heads", NUM_HEADS)
+# @pytest.mark.parametrize("head_size", HEAD_SIZES)
+# @pytest.mark.parametrize("block_size", BLOCK_SIZES)
+# @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+# @pytest.mark.parametrize("dtype", DTYPES)
+# @pytest.mark.parametrize("seed", SEEDS)
+# @pytest.mark.parametrize("device", CUDA_DEVICES)
+# @torch.inference_mode()
+# def test_fp8_e4m3_conversion(
+#     num_heads: int,
+#     head_size: int,
+#     block_size: int,
+#     num_blocks: int,
+#     dtype: torch.dtype,
+#     seed: int,
+#     device: str,
+# ) -> None:
+#     random.seed(seed)
+#     torch.random.manual_seed(seed)
+#     torch.cuda.manual_seed(seed)
+
+#     low = -224.0
+#     high = 224.0
+#     shape = (num_blocks, num_heads, head_size, block_size)
+#     cache = torch.empty(shape, dtype=dtype, device=device)
+#     cache.uniform_(low, high)
+
+#     cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
+#     ops.convert_fp8(cache_fp8, cache)
+
+#     converted_cache = torch.empty_like(cache)
+#     ops.convert_fp8(converted_cache, cache_fp8)
+
+#     assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    marlin_quantize, marlin_weights)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    gptq_pack, quantize_weights, sort_weights)
+from vllm.utils import is_hip

 ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]
@@ -43,7 +44,7 @@ def rand_data(shape):
    return torch.randn(shape, dtype=torch.half, device="cuda")


-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_marlin_supported() or is_hip(),
                    reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@@ -106,7 +107,7 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
    assert torch.allclose(marlin_q_w_1, marlin_q_w_2)


-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_marlin_supported() or is_hip(),
                    reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@@ -171,7 +172,7 @@ def test_marlin_gemm(
    assert max_diff < 0.04


-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_marlin_supported() or is_hip(),
                    reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS)