Commit 21c06ecb authored by zhuwenwen
Browse files

update kernel test

parent 103f3110
...@@ -53,12 +53,12 @@ git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的 ...@@ -53,12 +53,12 @@ git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的
- 提供2种源码编译方式(进入vllm目录): - 提供2种源码编译方式(进入vllm目录):
``` ```
1. 编译whl包并安装 1. 编译whl包并安装
python setup.py bdist_wheel VLLM_INSTALL_PUNICA_KERNELS=1 python setup.py bdist_wheel
cd dist cd dist
pip install vllm* pip install vllm*
2. 源码编译安装 2. 源码编译安装
python3 setup.py install VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
``` ```
#### 运行基础环境准备 #### 运行基础环境准备
......
...@@ -10,7 +10,7 @@ from vllm.attention.selector import which_attn_to_use ...@@ -10,7 +10,7 @@ from vllm.attention.selector import which_attn_to_use
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) @pytest.mark.parametrize("device", ["cpu", "hip"])
def test_env(name: str, device: str, monkeypatch): def test_env(name: str, device: str, monkeypatch):
"""Test that the attention selector can be set via environment variable. """Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend. Note that we do not test FlashAttn because it is the default backend.
......
...@@ -29,7 +29,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing ...@@ -29,7 +29,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
HEAD_SIZES = [64, 112] HEAD_SIZES = [64, 112]
BLOCK_SIZES = [16, 32] BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True] USE_ALIBI = [False, True]
KV_CACHE_DTYPE = ["auto", "fp8"] KV_CACHE_DTYPE = ["auto", "fp8"] if not is_hip() else ["auto"]
SEEDS = [0] SEEDS = [0]
CUDA_DEVICES = ['cuda:0'] CUDA_DEVICES = ['cuda:0']
BLOCKSPARSE_LOCAL_BLOCKS = [16] BLOCKSPARSE_LOCAL_BLOCKS = [16]
......
...@@ -345,37 +345,37 @@ def test_swap_blocks( ...@@ -345,37 +345,37 @@ def test_swap_blocks(
dist_value_caches[0][dst].cpu()) dist_value_caches[0][dst].cpu())
@pytest.mark.parametrize("num_heads", NUM_HEADS) # @pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES) # @pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES) # @pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) # @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES) # @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) # @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES) # @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() # @torch.inference_mode()
def test_fp8_e4m3_conversion( # def test_fp8_e4m3_conversion(
num_heads: int, # num_heads: int,
head_size: int, # head_size: int,
block_size: int, # block_size: int,
num_blocks: int, # num_blocks: int,
dtype: torch.dtype, # dtype: torch.dtype,
seed: int, # seed: int,
device: str, # device: str,
) -> None: # ) -> None:
random.seed(seed) # random.seed(seed)
torch.random.manual_seed(seed) # torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) # torch.cuda.manual_seed(seed)
low = -224.0 # low = -224.0
high = 224.0 # high = 224.0
shape = (num_blocks, num_heads, head_size, block_size) # shape = (num_blocks, num_heads, head_size, block_size)
cache = torch.empty(shape, dtype=dtype, device=device) # cache = torch.empty(shape, dtype=dtype, device=device)
cache.uniform_(low, high) # cache.uniform_(low, high)
cache_fp8 = torch.empty_like(cache, dtype=torch.uint8) # cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
ops.convert_fp8(cache_fp8, cache) # ops.convert_fp8(cache_fp8, cache)
converted_cache = torch.empty_like(cache) # converted_cache = torch.empty_like(cache)
ops.convert_fp8(converted_cache, cache_fp8) # ops.convert_fp8(converted_cache, cache_fp8)
assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1) # assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
...@@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( ...@@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_quantize, marlin_weights) marlin_quantize, marlin_weights)
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, quantize_weights, sort_weights) gptq_pack, quantize_weights, sort_weights)
from vllm.utils import is_hip
ACT_ORDER_OPTS = [False, True] ACT_ORDER_OPTS = [False, True]
K_FULL_OPTS = [False, True] K_FULL_OPTS = [False, True]
...@@ -43,7 +44,7 @@ def rand_data(shape): ...@@ -43,7 +44,7 @@ def rand_data(shape):
return torch.randn(shape, dtype=torch.half, device="cuda") return torch.randn(shape, dtype=torch.half, device="cuda")
@pytest.mark.skipif(not is_marlin_supported(), @pytest.mark.skipif(not is_marlin_supported() or is_hip(),
reason="Marlin is not supported on this GPU type.") reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
...@@ -106,7 +107,7 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order, ...@@ -106,7 +107,7 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
assert torch.allclose(marlin_q_w_1, marlin_q_w_2) assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
@pytest.mark.skipif(not is_marlin_supported(), @pytest.mark.skipif(not is_marlin_supported() or is_hip(),
reason="Marlin is not supported on this GPU type.") reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
...@@ -171,7 +172,7 @@ def test_marlin_gemm( ...@@ -171,7 +172,7 @@ def test_marlin_gemm(
assert max_diff < 0.04 assert max_diff < 0.04
@pytest.mark.skipif(not is_marlin_supported(), @pytest.mark.skipif(not is_marlin_supported() or is_hip(),
reason="Marlin is not supported on this GPU type.") reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS) @pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS) @pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment