Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6ffa3f31
Unverified
Commit
6ffa3f31
authored
Sep 18, 2024
by
Cyrus Leung
Committed by
GitHub
Sep 18, 2024
Browse files
[CI/Build] Avoid CUDA initialization (#8534)
parent
e3515729
Changes
55
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
66 additions
and
108 deletions
+66
-108
benchmarks/kernels/benchmark_layernorm.py
benchmarks/kernels/benchmark_layernorm.py
+3
-6
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe.py
+3
-3
benchmarks/kernels/benchmark_paged_attention.py
benchmarks/kernels/benchmark_paged_attention.py
+2
-5
benchmarks/kernels/benchmark_quant.py
benchmarks/kernels/benchmark_quant.py
+3
-6
benchmarks/kernels/benchmark_rope.py
benchmarks/kernels/benchmark_rope.py
+2
-4
tests/kernels/test_activation.py
tests/kernels/test_activation.py
+3
-6
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+5
-13
tests/kernels/test_attention_selector.py
tests/kernels/test_attention_selector.py
+1
-1
tests/kernels/test_awq_triton.py
tests/kernels/test_awq_triton.py
+3
-2
tests/kernels/test_blocksparse_attention.py
tests/kernels/test_blocksparse_attention.py
+3
-9
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+7
-18
tests/kernels/test_causal_conv1d.py
tests/kernels/test_causal_conv1d.py
+3
-2
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+4
-7
tests/kernels/test_flash_attn.py
tests/kernels/test_flash_attn.py
+3
-2
tests/kernels/test_flashinfer.py
tests/kernels/test_flashinfer.py
+6
-4
tests/kernels/test_fp8_quant.py
tests/kernels/test_fp8_quant.py
+4
-6
tests/kernels/test_gguf.py
tests/kernels/test_gguf.py
+3
-2
tests/kernels/test_int8_quant.py
tests/kernels/test_int8_quant.py
+5
-8
tests/kernels/test_layernorm.py
tests/kernels/test_layernorm.py
+2
-3
tests/kernels/test_machete_gemm.py
tests/kernels/test_machete_gemm.py
+1
-1
No files found.
benchmarks/kernels/benchmark_layernorm.py
View file @
6ffa3f31
import
random
import
time
import
torch
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
seed_everything
)
@
torch
.
inference_mode
()
...
...
@@ -16,10 +16,7 @@ def main(num_tokens: int,
do_profile
:
bool
=
False
,
num_warmup_iters
:
int
=
5
,
num_iters
:
int
=
100
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
"cuda"
)
layer
=
RMSNorm
(
hidden_size
).
to
(
dtype
=
dtype
)
...
...
benchmarks/kernels/benchmark_moe.py
View file @
6ffa3f31
...
...
@@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
from
transformers
import
AutoConfig
from
vllm.model_executor.layers.fused_moe.fused_moe
import
*
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
,
seed_everything
class
BenchmarkConfig
(
TypedDict
):
...
...
@@ -166,7 +166,7 @@ class BenchmarkWorker:
def
__init__
(
self
,
seed
:
int
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
seed
)
seed_everything
(
seed
)
self
.
seed
=
seed
def
benchmark
(
...
...
@@ -180,7 +180,7 @@ class BenchmarkWorker:
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
)
->
Tuple
[
Dict
[
str
,
int
],
float
]:
torch
.
cuda
.
manual_seed_all
(
self
.
seed
)
seed_everything
(
self
.
seed
)
dtype_str
=
get_config_dtype_str
(
dtype
,
use_int8_w8a16
=
use_int8_w8a16
,
use_fp8_w8a8
=
use_fp8_w8a8
)
...
...
benchmarks/kernels/benchmark_paged_attention.py
View file @
6ffa3f31
...
...
@@ -6,7 +6,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
create_kv_caches_with_random
)
create_kv_caches_with_random
,
seed_everything
)
NUM_BLOCKS
=
1024
PARTITION_SIZE
=
512
...
...
@@ -28,10 +28,7 @@ def main(
device
:
str
=
"cuda"
,
kv_cache_dtype
:
Optional
[
str
]
=
None
,
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
query
=
torch
.
empty
(
num_seqs
,
...
...
benchmarks/kernels/benchmark_quant.py
View file @
6ffa3f31
import
random
import
time
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
seed_everything
)
@
torch
.
inference_mode
()
...
...
@@ -17,10 +17,7 @@ def main(num_tokens: int,
do_profile
:
bool
=
False
,
num_warmup_iters
:
int
=
5
,
num_iters
:
int
=
100
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
"cuda"
)
x
=
torch
.
randn
(
num_tokens
,
hidden_size
,
dtype
=
dtype
)
...
...
benchmarks/kernels/benchmark_rope.py
View file @
6ffa3f31
...
...
@@ -6,7 +6,7 @@ import torch
from
vllm.model_executor.layers.rotary_embedding
import
(
RotaryEmbedding
,
get_rope
)
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
,
seed_everything
def
benchmark_rope_kernels_multi_lora
(
...
...
@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
max_position
:
int
=
8192
,
base
:
int
=
10000
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
rotary_dim
=
head_size
...
...
tests/kernels/test_activation.py
View file @
6ffa3f31
...
...
@@ -7,6 +7,7 @@ from tests.kernels.utils import opcheck
from
vllm.model_executor.layers.activation
import
(
FastGELU
,
GeluAndMul
,
NewGELU
,
QuickGELU
,
SiluAndMul
)
from
vllm.utils
import
seed_everything
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
...
@@ -34,9 +35,7 @@ def test_act_and_mul(
seed
:
int
,
device
:
str
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
x
=
torch
.
randn
(
num_tokens
,
2
*
d
,
dtype
=
dtype
)
if
activation
==
"silu"
:
...
...
@@ -77,9 +76,7 @@ def test_activation(
seed
:
int
,
device
:
str
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
x
=
torch
.
randn
(
num_tokens
,
d
,
dtype
=
dtype
)
layer
=
activation
[
0
]()
...
...
tests/kernels/test_attention.py
View file @
6ffa3f31
...
...
@@ -6,7 +6,7 @@ import torch
from
tests.kernels.utils
import
opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
get_max_shared_memory_bytes
,
is_hip
from
vllm.utils
import
get_max_shared_memory_bytes
,
is_hip
,
seed_everything
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
...
@@ -139,10 +139,8 @@ def test_paged_attention(
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
num_query_heads
,
num_kv_heads
=
num_heads
...
...
@@ -354,10 +352,7 @@ def test_paged_attention_rocm(
seed
:
int
,
device
:
str
,
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
num_query_heads
,
num_kv_heads
=
num_heads
...
...
@@ -506,10 +501,7 @@ def test_multi_query_kv_attention(
seed
:
int
,
device
:
str
,
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
...
...
tests/kernels/test_attention_selector.py
View file @
6ffa3f31
...
...
@@ -45,7 +45,7 @@ def test_flash_attn(monkeypatch):
override_backend_env_variable
(
monkeypatch
,
STR_FLASH_ATTN_VAL
)
# Unsupported CUDA arch
with
patch
(
"torch.cuda.get_device_capability"
,
return_value
=
[
7
,
5
]
):
with
patch
(
"torch.cuda.get_device_capability"
,
return_value
=
(
7
,
5
)
):
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
None
,
16
)
assert
backend
.
name
!=
STR_FLASH_ATTN_VAL
...
...
tests/kernels/test_awq_triton.py
View file @
6ffa3f31
...
...
@@ -7,6 +7,7 @@ import torch
from
vllm.model_executor.layers.quantization.awq_triton
import
(
AWQ_TRITON_SUPPORTED_GROUP_SIZES
,
awq_dequantize_triton
,
awq_gemm_triton
)
from
vllm.utils
import
seed_everything
device
=
"cuda"
...
...
@@ -79,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols
=
qweight_cols
zeros_dtype
=
torch
.
int32
torch
.
manual_seed
(
0
)
seed_everything
(
0
)
qweight
=
torch
.
randint
(
0
,
torch
.
iinfo
(
torch
.
int32
).
max
,
...
...
@@ -133,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows
=
scales_rows
qzeros_cols
=
qweight_cols
torch
.
manual_seed
(
0
)
seed_everything
(
0
)
input
=
torch
.
rand
((
input_rows
,
input_cols
),
dtype
=
input_dtype
,
...
...
tests/kernels/test_blocksparse_attention.py
View file @
6ffa3f31
...
...
@@ -7,7 +7,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm.attention.ops.blocksparse_attention.interface
import
(
LocalStridedBlockSparseAttn
)
from
vllm.utils
import
get_max_shared_memory_bytes
,
is_hip
from
vllm.utils
import
get_max_shared_memory_bytes
,
is_hip
,
seed_everything
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
...
@@ -172,10 +172,7 @@ def test_paged_attention(
blocksparse_block_size
:
int
,
blocksparse_head_sliding_step
:
int
,
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
num_query_heads
,
num_kv_heads
=
num_heads
...
...
@@ -386,10 +383,7 @@ def test_varlen_blocksparse_attention_prefill(
seed
:
int
,
device
:
str
,
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
...
...
tests/kernels/test_cache.py
View file @
6ffa3f31
...
...
@@ -6,6 +6,7 @@ import torch
from
tests.kernels.utils
import
DEFAULT_OPCHECK_TEST_UTILS
,
opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
seed_everything
COPYING_DIRECTION
=
[(
'cuda'
,
'cpu'
),
(
'cuda'
,
'cuda'
),
(
'cpu'
,
'cuda'
)]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
...
...
@@ -55,10 +56,7 @@ def test_copy_blocks(
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
# Generate random block mappings where each source block is mapped to two
# destination blocks.
...
...
@@ -134,10 +132,7 @@ def test_reshape_and_cache(
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
# Create a random slot mapping.
num_slots
=
block_size
*
num_blocks
...
...
@@ -229,9 +224,7 @@ def test_reshape_and_cache_flash(
device
:
str
,
kv_cache_dtype
:
str
,
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
# Create a random slot mapping.
...
...
@@ -345,10 +338,8 @@ def test_swap_blocks(
pytest
.
skip
()
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
src_device
=
device
if
direction
[
0
]
==
"cuda"
else
'cpu'
dst_device
=
device
if
direction
[
1
]
==
"cuda"
else
'cpu'
...
...
@@ -417,9 +408,7 @@ def test_fp8_e4m3_conversion(
seed
:
int
,
device
:
str
,
)
->
None
:
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
low
=
-
224.0
high
=
224.0
...
...
tests/kernels/test_causal_conv1d.py
View file @
6ffa3f31
...
...
@@ -7,6 +7,7 @@ from einops import rearrange
from
vllm.model_executor.layers.mamba.ops.causal_conv1d
import
(
causal_conv1d_fn
,
causal_conv1d_update
)
from
vllm.utils
import
seed_everything
def
causal_conv1d_ref
(
...
...
@@ -104,7 +105,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
if
itype
==
torch
.
bfloat16
:
rtol
,
atol
=
1e-2
,
5e-2
# set seed
torch
.
random
.
manual_seed
(
0
)
seed_everything
(
0
)
if
not
channel_last
:
x
=
torch
.
randn
(
batch
,
4096
+
dim
+
64
,
...
...
@@ -175,7 +176,7 @@ def test_causal_conv1d_update(batch, dim, width, has_bias, silu_activation,
if
itype
==
torch
.
bfloat16
:
rtol
,
atol
=
1e-2
,
5e-2
# set seed
torch
.
random
.
manual_seed
(
0
)
seed_everything
(
0
)
batch
=
2
x
=
torch
.
randn
(
batch
,
dim
,
device
=
device
,
dtype
=
itype
)
conv_state
=
torch
.
randn
(
batch
,
dim
,
width
,
device
=
device
,
dtype
=
itype
)
...
...
tests/kernels/test_cutlass.py
View file @
6ffa3f31
...
...
@@ -15,9 +15,6 @@ CUDA_DEVICES = [
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
capability
=
current_platform
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
def
to_fp8
(
tensor
:
torch
.
Tensor
):
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
...
...
@@ -119,7 +116,7 @@ def cutlass_int8_gemm_helper(m: int,
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
capability
<
89
,
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_
capability
(
89
)
,
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_fp8_gemm
(
m
:
int
,
n
:
int
,
k
:
int
,
per_act_token
:
bool
,
per_out_ch
:
bool
,
use_bias
:
bool
):
...
...
@@ -157,7 +154,7 @@ def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"out_dtype"
,
[
torch
.
bfloat16
,
torch
.
float16
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
capability
<
89
,
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_
capability
(
89
)
,
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_fp8_gemm_output_dtype
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
out_dtype
:
Type
[
torch
.
dtype
],
...
...
@@ -175,7 +172,7 @@ def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
skipif
(
capability
<
89
,
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_
capability
(
89
)
,
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_fp8_gemm_devices
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
use_bias
:
bool
,
device
:
str
):
...
...
@@ -207,7 +204,7 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
capability
<
89
,
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_
capability
(
89
)
,
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_fp8_gemm_m_sweep
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
use_bias
:
bool
):
...
...
tests/kernels/test_flash_attn.py
View file @
6ffa3f31
...
...
@@ -4,6 +4,7 @@ import pytest
import
torch
import
vllm.attention.backends.flash_attn
# noqa: F401
from
vllm.utils
import
seed_everything
NUM_HEADS
=
[(
4
,
4
),
(
8
,
2
),
(
16
,
2
)]
HEAD_SIZES
=
[
128
,
256
]
...
...
@@ -87,7 +88,7 @@ def test_flash_attn_with_paged_kv(
num_blocks
:
int
,
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
seed_everything
(
0
)
num_seqs
=
len
(
kv_lens
)
num_query_heads
=
num_heads
[
0
]
num_kv_heads
=
num_heads
[
1
]
...
...
@@ -174,7 +175,7 @@ def test_varlen_with_paged_kv(
num_blocks
:
int
,
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
seed_everything
(
0
)
num_seqs
=
len
(
seq_lens
)
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
...
...
tests/kernels/test_flashinfer.py
View file @
6ffa3f31
...
...
@@ -4,6 +4,8 @@ import flashinfer
import
pytest
import
torch
from
vllm.utils
import
seed_everything
NUM_HEADS
=
[(
16
,
16
),
(
32
,
8
),
(
64
,
8
),
(
6
,
1
)]
HEAD_SIZES
=
[
128
,
256
]
BLOCK_SIZES
=
[
16
,
32
]
...
...
@@ -82,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv(
soft_cap
:
Optional
[
float
],
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
seed_everything
(
0
)
num_seqs
=
len
(
kv_lens
)
num_query_heads
=
num_heads
[
0
]
num_kv_heads
=
num_heads
[
1
]
...
...
@@ -168,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
block_size
:
int
,
soft_cap
:
Optional
[
float
])
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
seed_everything
(
0
)
num_seqs
=
len
(
seq_lens
)
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
...
...
@@ -266,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
head_size
:
int
,
dtype
:
torch
.
dtype
,
block_size
:
int
,
soft_cap
:
Optional
[
float
])
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
seed_everything
(
0
)
num_seqs
=
len
(
seq_lens
)
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
...
...
@@ -379,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
)
->
None
:
# test doesn't work for num_heads = (16,16)
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
seed_everything
(
0
)
num_seqs
=
len
(
kv_lens
)
num_query_heads
=
num_heads
[
0
]
num_kv_heads
=
num_heads
[
1
]
...
...
tests/kernels/test_fp8_quant.py
View file @
6ffa3f31
...
...
@@ -5,6 +5,7 @@ import vllm._custom_ops as ops
from
tests.kernels.quant_utils
import
(
FP8_DTYPE
,
ref_dynamic_per_tensor_fp8_quant
,
ref_dynamic_per_token_quant
)
from
vllm.utils
import
seed_everything
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
1
,
2
,
3
,
4
,
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
...
...
@@ -24,8 +25,7 @@ SEEDS = [0]
def
test_dynamic_per_token_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
scale_ub
:
bool
,
seed
:
int
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
+
1e-6
# avoid nans
...
...
@@ -49,8 +49,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
@
torch
.
inference_mode
()
def
test_dynamic_per_tensor_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
...
...
@@ -67,8 +66,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
def
test_fp8_quant_large
(
seed
:
int
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
num_tokens
=
1024000
# Mistral-Nemo's max_position_embeddings
hidden_size
=
1152
# Smallest hidden_size to reproduce the error
...
...
tests/kernels/test_gguf.py
View file @
6ffa3f31
...
...
@@ -7,6 +7,7 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
from
huggingface_hub
import
snapshot_download
import
vllm._custom_ops
as
ops
from
vllm.utils
import
seed_everything
GGUF_SAMPLE
=
snapshot_download
(
"Isotr0py/test-gguf-sample"
)
...
...
@@ -74,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
@
torch
.
inference_mode
()
def
test_mmvq
(
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
):
torch
.
cuda
.
manual_seed_all
(
0
)
seed_everything
(
0
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
x
=
torch
.
rand
((
1
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
...
...
@@ -110,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype,
@
torch
.
inference_mode
()
def
test_mmq
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
):
torch
.
cuda
.
manual_seed_all
(
0
)
seed_everything
(
0
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
x
=
torch
.
rand
((
num_tokens
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
...
...
tests/kernels/test_int8_quant.py
View file @
6ffa3f31
...
...
@@ -4,6 +4,7 @@ import torch
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
tests.kernels.utils
import
opcheck
from
vllm._custom_ops
import
scaled_int8_quant
from
vllm.utils
import
seed_everything
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
...
...
@@ -44,8 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
@
torch
.
inference_mode
()
def
test_dynamic_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
...
@@ -68,8 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
@
torch
.
inference_mode
()
def
test_dynamic_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
...
...
@@ -113,8 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
def
test_static_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
...
@@ -140,8 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
def
test_static_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
,
azp
:
int
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
...
...
tests/kernels/test_layernorm.py
View file @
6ffa3f31
...
...
@@ -3,6 +3,7 @@ import torch
from
tests.kernels.utils
import
opcheck
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.utils
import
seed_everything
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
NUM_TOKENS
=
[
7
,
83
,
4096
]
# Arbitrary values for testing
...
...
@@ -30,9 +31,7 @@ def test_rms_norm(
seed
:
int
,
device
:
str
,
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
layer
=
RMSNorm
(
hidden_size
).
to
(
dtype
=
dtype
)
layer
.
weight
.
data
.
normal_
(
mean
=
1.0
,
std
=
0.1
)
...
...
tests/kernels/test_machete_gemm.py
View file @
6ffa3f31
...
...
@@ -48,7 +48,7 @@ WTYPE_ZEROPOINTS = [
# `is_quant_method_supported` conflates kernels with quantization methods
# an assumption which is breaking down as quantization methods can have
# multiple kernels and some kernels support multiple quantization methods.
IS_SUPPORTED_BY_GPU
=
current_platform
.
get
_device_capability
(
)[
0
]
>=
9
IS_SUPPORTED_BY_GPU
=
current_platform
.
has
_device_capability
(
90
)
def
rand_data
(
shape
,
dtype
=
torch
.
float16
):
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment