Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
21c06ecb
Commit
21c06ecb
authored
Jun 15, 2024
by
zhuwenwen
Browse files
update kernel test
parent
103f3110
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
42 additions
and
41 deletions
+42
-41
README.md
README.md
+2
-2
tests/kernels/test_attention_selector.py
tests/kernels/test_attention_selector.py
+1
-1
tests/kernels/test_blocksparse_attention.py
tests/kernels/test_blocksparse_attention.py
+1
-1
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+34
-34
tests/kernels/test_marlin_gemm.py
tests/kernels/test_marlin_gemm.py
+4
-3
No files found.
README.md
View file @
21c06ecb
...
@@ -53,12 +53,12 @@ git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的
...
@@ -53,12 +53,12 @@ git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的
-
提供2种源码编译方式(进入vllm目录):
-
提供2种源码编译方式(进入vllm目录):
```
```
1. 编译whl包并安装
1. 编译whl包并安装
python setup.py bdist_wheel
VLLM_INSTALL_PUNICA_KERNELS=1
python setup.py bdist_wheel
cd dist
cd dist
pip install vllm*
pip install vllm*
2. 源码编译安装
2. 源码编译安装
python3 setup.py install
VLLM_INSTALL_PUNICA_KERNELS=1
python3 setup.py install
```
```
#### 运行基础环境准备
#### 运行基础环境准备
...
...
tests/kernels/test_attention_selector.py
View file @
21c06ecb
...
@@ -10,7 +10,7 @@ from vllm.attention.selector import which_attn_to_use
...
@@ -10,7 +10,7 @@ from vllm.attention.selector import which_attn_to_use
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"name"
,
[
"TORCH_SDPA"
,
"ROCM_FLASH"
,
"XFORMERS"
,
"FLASHINFER"
])
"name"
,
[
"TORCH_SDPA"
,
"ROCM_FLASH"
,
"XFORMERS"
,
"FLASHINFER"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
,
"cuda"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
])
def
test_env
(
name
:
str
,
device
:
str
,
monkeypatch
):
def
test_env
(
name
:
str
,
device
:
str
,
monkeypatch
):
"""Test that the attention selector can be set via environment variable.
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
Note that we do not test FlashAttn because it is the default backend.
...
...
tests/kernels/test_blocksparse_attention.py
View file @
21c06ecb
...
@@ -29,7 +29,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
...
@@ -29,7 +29,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
HEAD_SIZES
=
[
64
,
112
]
HEAD_SIZES
=
[
64
,
112
]
BLOCK_SIZES
=
[
16
,
32
]
BLOCK_SIZES
=
[
16
,
32
]
USE_ALIBI
=
[
False
,
True
]
USE_ALIBI
=
[
False
,
True
]
KV_CACHE_DTYPE
=
[
"auto"
,
"fp8"
]
KV_CACHE_DTYPE
=
[
"auto"
,
"fp8"
]
if
not
is_hip
()
else
[
"auto"
]
SEEDS
=
[
0
]
SEEDS
=
[
0
]
CUDA_DEVICES
=
[
'cuda:0'
]
CUDA_DEVICES
=
[
'cuda:0'
]
BLOCKSPARSE_LOCAL_BLOCKS
=
[
16
]
BLOCKSPARSE_LOCAL_BLOCKS
=
[
16
]
...
...
tests/kernels/test_cache.py
View file @
21c06ecb
...
@@ -345,37 +345,37 @@ def test_swap_blocks(
...
@@ -345,37 +345,37 @@ def test_swap_blocks(
dist_value_caches
[
0
][
dst
].
cpu
())
dist_value_caches
[
0
][
dst
].
cpu
())
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
#
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@
pytest
.
mark
.
parametrize
(
"head_size"
,
HEAD_SIZES
)
#
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
BLOCK_SIZES
)
#
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
#
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
#
@pytest.mark.parametrize("dtype", DTYPES)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
#
@pytest.mark.parametrize("seed", SEEDS)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
#
@pytest.mark.parametrize("device", CUDA_DEVICES)
@
torch
.
inference_mode
()
#
@torch.inference_mode()
def
test_fp8_e4m3_conversion
(
#
def test_fp8_e4m3_conversion(
num_heads
:
int
,
#
num_heads: int,
head_size
:
int
,
#
head_size: int,
block_size
:
int
,
#
block_size: int,
num_blocks
:
int
,
#
num_blocks: int,
dtype
:
torch
.
dtype
,
#
dtype: torch.dtype,
seed
:
int
,
#
seed: int,
device
:
str
,
#
device: str,
)
->
None
:
#
) -> None:
random
.
seed
(
seed
)
#
random.seed(seed)
torch
.
random
.
manual_seed
(
seed
)
#
torch.random.manual_seed(seed)
torch
.
cuda
.
manual_seed
(
seed
)
#
torch.cuda.manual_seed(seed)
low
=
-
224.0
#
low = -224.0
high
=
224.0
#
high = 224.0
shape
=
(
num_blocks
,
num_heads
,
head_size
,
block_size
)
#
shape = (num_blocks, num_heads, head_size, block_size)
cache
=
torch
.
empty
(
shape
,
dtype
=
dtype
,
device
=
device
)
#
cache = torch.empty(shape, dtype=dtype, device=device)
cache
.
uniform_
(
low
,
high
)
#
cache.uniform_(low, high)
cache_fp8
=
torch
.
empty_like
(
cache
,
dtype
=
torch
.
uint8
)
#
cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
ops
.
convert_fp8
(
cache_fp8
,
cache
)
#
ops.convert_fp8(cache_fp8, cache)
converted_cache
=
torch
.
empty_like
(
cache
)
#
converted_cache = torch.empty_like(cache)
ops
.
convert_fp8
(
converted_cache
,
cache_fp8
)
#
ops.convert_fp8(converted_cache, cache_fp8)
assert
torch
.
allclose
(
cache
,
converted_cache
,
atol
=
0.001
,
rtol
=
0.1
)
#
assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
tests/kernels/test_marlin_gemm.py
View file @
21c06ecb
...
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
...
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_quantize
,
marlin_weights
)
marlin_quantize
,
marlin_weights
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
gptq_pack
,
quantize_weights
,
sort_weights
)
gptq_pack
,
quantize_weights
,
sort_weights
)
from
vllm.utils
import
is_hip
ACT_ORDER_OPTS
=
[
False
,
True
]
ACT_ORDER_OPTS
=
[
False
,
True
]
K_FULL_OPTS
=
[
False
,
True
]
K_FULL_OPTS
=
[
False
,
True
]
...
@@ -43,7 +44,7 @@ def rand_data(shape):
...
@@ -43,7 +44,7 @@ def rand_data(shape):
return
torch
.
randn
(
shape
,
dtype
=
torch
.
half
,
device
=
"cuda"
)
return
torch
.
randn
(
shape
,
dtype
=
torch
.
half
,
device
=
"cuda"
)
@
pytest
.
mark
.
skipif
(
not
is_marlin_supported
(),
@
pytest
.
mark
.
skipif
(
not
is_marlin_supported
()
or
is_hip
()
,
reason
=
"Marlin is not supported on this GPU type."
)
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
...
@@ -106,7 +107,7 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
...
@@ -106,7 +107,7 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
assert
torch
.
allclose
(
marlin_q_w_1
,
marlin_q_w_2
)
assert
torch
.
allclose
(
marlin_q_w_1
,
marlin_q_w_2
)
@
pytest
.
mark
.
skipif
(
not
is_marlin_supported
(),
@
pytest
.
mark
.
skipif
(
not
is_marlin_supported
()
or
is_hip
()
,
reason
=
"Marlin is not supported on this GPU type."
)
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
...
@@ -171,7 +172,7 @@ def test_marlin_gemm(
...
@@ -171,7 +172,7 @@ def test_marlin_gemm(
assert
max_diff
<
0.04
assert
max_diff
<
0.04
@
pytest
.
mark
.
skipif
(
not
is_marlin_supported
(),
@
pytest
.
mark
.
skipif
(
not
is_marlin_supported
()
or
is_hip
()
,
reason
=
"Marlin is not supported on this GPU type."
)
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_24_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_24_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_24_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_24_N_CHUNKS
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment