Commit 21c06ecb authored by zhuwenwen
Browse files

update kernel test

parent 103f3110
......@@ -53,12 +53,12 @@ git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的
- 提供2种源码编译方式(进入vllm目录):
```
1. 编译whl包并安装
python setup.py bdist_wheel
VLLM_INSTALL_PUNICA_KERNELS=1 python setup.py bdist_wheel
cd dist
pip install vllm*
2. 源码编译安装
python3 setup.py install
VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
```
#### 运行基础环境准备
......
......@@ -10,7 +10,7 @@ from vllm.attention.selector import which_attn_to_use
@pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
@pytest.mark.parametrize("device", ["cpu", "hip"])
def test_env(name: str, device: str, monkeypatch):
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
......
......@@ -29,7 +29,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
HEAD_SIZES = [64, 112]
BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True]
KV_CACHE_DTYPE = ["auto", "fp8"]
KV_CACHE_DTYPE = ["auto", "fp8"] if not is_hip() else ["auto"]
SEEDS = [0]
CUDA_DEVICES = ['cuda:0']
BLOCKSPARSE_LOCAL_BLOCKS = [16]
......
......@@ -345,37 +345,37 @@ def test_swap_blocks(
dist_value_caches[0][dst].cpu())
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_fp8_e4m3_conversion(
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    """Round-trip a random cache tensor through ``ops.convert_fp8``.

    A tensor of shape ``(num_blocks, num_heads, head_size, block_size)`` is
    filled with uniform random values, converted into a ``uint8`` buffer,
    converted back into the original dtype, and compared against the
    starting values with a loose tolerance.
    """
    # Seed every RNG touched by the test so each run is reproducible.
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # NOTE(review): +/-224 is presumably chosen to stay inside the value
    # range the fp8 format can represent -- confirm against the kernel.
    bound = 224.0
    original = torch.empty(
        (num_blocks, num_heads, head_size, block_size),
        dtype=dtype,
        device=device,
    ).uniform_(-bound, bound)

    # Forward conversion: source dtype -> one-byte storage.
    packed = torch.empty_like(original, dtype=torch.uint8)
    ops.convert_fp8(packed, original)

    # Reverse conversion: one-byte storage -> source dtype.
    restored = torch.empty_like(original)
    ops.convert_fp8(restored, packed)

    assert torch.allclose(original, restored, atol=0.001, rtol=0.1)
# @pytest.mark.parametrize("num_heads", NUM_HEADS)
# @pytest.mark.parametrize("head_size", HEAD_SIZES)
# @pytest.mark.parametrize("block_size", BLOCK_SIZES)
# @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
# @pytest.mark.parametrize("dtype", DTYPES)
# @pytest.mark.parametrize("seed", SEEDS)
# @pytest.mark.parametrize("device", CUDA_DEVICES)
# @torch.inference_mode()
# def test_fp8_e4m3_conversion(
# num_heads: int,
# head_size: int,
# block_size: int,
# num_blocks: int,
# dtype: torch.dtype,
# seed: int,
# device: str,
# ) -> None:
# random.seed(seed)
# torch.random.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# low = -224.0
# high = 224.0
# shape = (num_blocks, num_heads, head_size, block_size)
# cache = torch.empty(shape, dtype=dtype, device=device)
# cache.uniform_(low, high)
# cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
# ops.convert_fp8(cache_fp8, cache)
# converted_cache = torch.empty_like(cache)
# ops.convert_fp8(converted_cache, cache_fp8)
# assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
......@@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_quantize, marlin_weights)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, quantize_weights, sort_weights)
from vllm.utils import is_hip
ACT_ORDER_OPTS = [False, True]
K_FULL_OPTS = [False, True]
......@@ -43,7 +44,7 @@ def rand_data(shape):
return torch.randn(shape, dtype=torch.half, device="cuda")
@pytest.mark.skipif(not is_marlin_supported(),
@pytest.mark.skipif(not is_marlin_supported() or is_hip(),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
......@@ -106,7 +107,7 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
@pytest.mark.skipif(not is_marlin_supported(),
@pytest.mark.skipif(not is_marlin_supported() or is_hip(),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
......@@ -171,7 +172,7 @@ def test_marlin_gemm(
assert max_diff < 0.04
@pytest.mark.skipif(not is_marlin_supported(),
@pytest.mark.skipif(not is_marlin_supported() or is_hip(),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment