Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
622b7ab9
Unverified
Commit
622b7ab9
authored
Oct 29, 2024
by
wangshuai09
Committed by
GitHub
Oct 29, 2024
Browse files
[Hardware] using current_platform.seed_everything (#9785)
Signed-off-by:
wangshuai09
<
391746016@qq.com
>
parent
09500f7d
Changes
27
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
70 additions
and
68 deletions
+70
-68
benchmarks/kernels/benchmark_layernorm.py
benchmarks/kernels/benchmark_layernorm.py
+3
-3
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe.py
+4
-3
benchmarks/kernels/benchmark_paged_attention.py
benchmarks/kernels/benchmark_paged_attention.py
+3
-2
benchmarks/kernels/benchmark_quant.py
benchmarks/kernels/benchmark_quant.py
+3
-3
benchmarks/kernels/benchmark_rope.py
benchmarks/kernels/benchmark_rope.py
+3
-2
tests/kernels/test_activation.py
tests/kernels/test_activation.py
+3
-3
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+3
-3
tests/kernels/test_awq_triton.py
tests/kernels/test_awq_triton.py
+3
-3
tests/kernels/test_blocksparse_attention.py
tests/kernels/test_blocksparse_attention.py
+3
-3
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+6
-6
tests/kernels/test_causal_conv1d.py
tests/kernels/test_causal_conv1d.py
+6
-6
tests/kernels/test_flash_attn.py
tests/kernels/test_flash_attn.py
+3
-3
tests/kernels/test_flashinfer.py
tests/kernels/test_flashinfer.py
+5
-5
tests/kernels/test_fp8_quant.py
tests/kernels/test_fp8_quant.py
+4
-4
tests/kernels/test_gguf.py
tests/kernels/test_gguf.py
+3
-3
tests/kernels/test_int8_quant.py
tests/kernels/test_int8_quant.py
+5
-5
tests/kernels/test_layernorm.py
tests/kernels/test_layernorm.py
+2
-2
tests/kernels/test_mamba_ssm.py
tests/kernels/test_mamba_ssm.py
+3
-3
tests/kernels/test_moe.py
tests/kernels/test_moe.py
+1
-2
tests/kernels/test_pos_encoding.py
tests/kernels/test_pos_encoding.py
+4
-4
No files found.
benchmarks/kernels/benchmark_layernorm.py
View file @
622b7ab9
...
@@ -3,8 +3,8 @@ import time
...
@@ -3,8 +3,8 @@ import time
import
torch
import
torch
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.
util
s
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
from
vllm.
platform
s
import
current_platform
seed_everything
)
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
...
@@ -16,7 +16,7 @@ def main(num_tokens: int,
...
@@ -16,7 +16,7 @@ def main(num_tokens: int,
do_profile
:
bool
=
False
,
do_profile
:
bool
=
False
,
num_warmup_iters
:
int
=
5
,
num_warmup_iters
:
int
=
5
,
num_iters
:
int
=
100
)
->
None
:
num_iters
:
int
=
100
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
layer
=
RMSNorm
(
hidden_size
).
to
(
dtype
=
dtype
)
layer
=
RMSNorm
(
hidden_size
).
to
(
dtype
=
dtype
)
...
...
benchmarks/kernels/benchmark_moe.py
View file @
622b7ab9
...
@@ -10,7 +10,8 @@ from ray.experimental.tqdm_ray import tqdm
...
@@ -10,7 +10,8 @@ from ray.experimental.tqdm_ray import tqdm
from
transformers
import
AutoConfig
from
transformers
import
AutoConfig
from
vllm.model_executor.layers.fused_moe.fused_moe
import
*
from
vllm.model_executor.layers.fused_moe.fused_moe
import
*
from
vllm.utils
import
FlexibleArgumentParser
,
seed_everything
from
vllm.platforms
import
current_platform
from
vllm.utils
import
FlexibleArgumentParser
class
BenchmarkConfig
(
TypedDict
):
class
BenchmarkConfig
(
TypedDict
):
...
@@ -167,7 +168,7 @@ class BenchmarkWorker:
...
@@ -167,7 +168,7 @@ class BenchmarkWorker:
def
__init__
(
self
,
seed
:
int
)
->
None
:
def
__init__
(
self
,
seed
:
int
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
self
.
seed
=
seed
self
.
seed
=
seed
def
benchmark
(
def
benchmark
(
...
@@ -181,7 +182,7 @@ class BenchmarkWorker:
...
@@ -181,7 +182,7 @@ class BenchmarkWorker:
use_fp8_w8a8
:
bool
,
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
use_int8_w8a16
:
bool
,
)
->
Tuple
[
Dict
[
str
,
int
],
float
]:
)
->
Tuple
[
Dict
[
str
,
int
],
float
]:
seed_everything
(
self
.
seed
)
current_platform
.
seed_everything
(
self
.
seed
)
dtype_str
=
get_config_dtype_str
(
dtype
,
dtype_str
=
get_config_dtype_str
(
dtype
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int8_w8a16
=
use_int8_w8a16
,
use_fp8_w8a8
=
use_fp8_w8a8
)
use_fp8_w8a8
=
use_fp8_w8a8
)
...
...
benchmarks/kernels/benchmark_paged_attention.py
View file @
622b7ab9
...
@@ -5,8 +5,9 @@ from typing import List, Optional
...
@@ -5,8 +5,9 @@ from typing import List, Optional
import
torch
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
create_kv_caches_with_random
,
seed_everything
)
create_kv_caches_with_random
)
NUM_BLOCKS
=
1024
NUM_BLOCKS
=
1024
PARTITION_SIZE
=
512
PARTITION_SIZE
=
512
...
@@ -28,7 +29,7 @@ def main(
...
@@ -28,7 +29,7 @@ def main(
device
:
str
=
"cuda"
,
device
:
str
=
"cuda"
,
kv_cache_dtype
:
Optional
[
str
]
=
None
,
kv_cache_dtype
:
Optional
[
str
]
=
None
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
query
=
torch
.
empty
(
num_seqs
,
query
=
torch
.
empty
(
num_seqs
,
...
...
benchmarks/kernels/benchmark_quant.py
View file @
622b7ab9
...
@@ -3,8 +3,8 @@ import time
...
@@ -3,8 +3,8 @@ import time
import
torch
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.
util
s
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
from
vllm.
platform
s
import
current_platform
seed_everything
)
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
...
@@ -17,7 +17,7 @@ def main(num_tokens: int,
...
@@ -17,7 +17,7 @@ def main(num_tokens: int,
do_profile
:
bool
=
False
,
do_profile
:
bool
=
False
,
num_warmup_iters
:
int
=
5
,
num_warmup_iters
:
int
=
5
,
num_iters
:
int
=
100
)
->
None
:
num_iters
:
int
=
100
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
x
=
torch
.
randn
(
num_tokens
,
hidden_size
,
dtype
=
dtype
)
x
=
torch
.
randn
(
num_tokens
,
hidden_size
,
dtype
=
dtype
)
...
...
benchmarks/kernels/benchmark_rope.py
View file @
622b7ab9
...
@@ -6,7 +6,8 @@ import torch
...
@@ -6,7 +6,8 @@ import torch
from
vllm.model_executor.layers.rotary_embedding
import
(
RotaryEmbedding
,
from
vllm.model_executor.layers.rotary_embedding
import
(
RotaryEmbedding
,
get_rope
)
get_rope
)
from
vllm.utils
import
FlexibleArgumentParser
,
seed_everything
from
vllm.platforms
import
current_platform
from
vllm.utils
import
FlexibleArgumentParser
def
benchmark_rope_kernels_multi_lora
(
def
benchmark_rope_kernels_multi_lora
(
...
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
...
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
max_position
:
int
=
8192
,
max_position
:
int
=
8192
,
base
:
int
=
10000
,
base
:
int
=
10000
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
...
...
tests/kernels/test_activation.py
View file @
622b7ab9
...
@@ -8,7 +8,7 @@ from tests.kernels.utils import opcheck
...
@@ -8,7 +8,7 @@ from tests.kernels.utils import opcheck
from
vllm.model_executor.layers.activation
import
(
FastGELU
,
FatreluAndMul
,
from
vllm.model_executor.layers.activation
import
(
FastGELU
,
FatreluAndMul
,
GeluAndMul
,
NewGELU
,
GeluAndMul
,
NewGELU
,
QuickGELU
,
SiluAndMul
)
QuickGELU
,
SiluAndMul
)
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
from
.allclose_default
import
get_default_atol
,
get_default_rtol
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
@@ -37,7 +37,7 @@ def test_act_and_mul(
...
@@ -37,7 +37,7 @@ def test_act_and_mul(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
x
=
torch
.
randn
(
num_tokens
,
2
*
d
,
dtype
=
dtype
)
x
=
torch
.
randn
(
num_tokens
,
2
*
d
,
dtype
=
dtype
)
if
activation
==
"silu"
:
if
activation
==
"silu"
:
...
@@ -85,7 +85,7 @@ def test_activation(
...
@@ -85,7 +85,7 @@ def test_activation(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
x
=
torch
.
randn
(
num_tokens
,
d
,
dtype
=
dtype
)
x
=
torch
.
randn
(
num_tokens
,
d
,
dtype
=
dtype
)
layer
=
activation
[
0
]()
layer
=
activation
[
0
]()
...
...
tests/kernels/test_attention.py
View file @
622b7ab9
...
@@ -7,7 +7,7 @@ import torch
...
@@ -7,7 +7,7 @@ import torch
from
tests.kernels.utils
import
opcheck
from
tests.kernels.utils
import
opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
get_max_shared_memory_bytes
,
seed_everything
from
vllm.utils
import
get_max_shared_memory_bytes
from
.allclose_default
import
get_default_atol
,
get_default_rtol
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
@@ -144,7 +144,7 @@ def test_paged_attention(
...
@@ -144,7 +144,7 @@ def test_paged_attention(
or
(
version
==
"rocm"
and
head_size
not
in
(
64
,
128
))):
or
(
version
==
"rocm"
and
head_size
not
in
(
64
,
128
))):
pytest
.
skip
()
pytest
.
skip
()
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
num_query_heads
,
num_kv_heads
=
num_heads
num_query_heads
,
num_kv_heads
=
num_heads
...
@@ -382,7 +382,7 @@ def test_multi_query_kv_attention(
...
@@ -382,7 +382,7 @@ def test_multi_query_kv_attention(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
# As the xformers library is already tested with its own tests, we can use
...
...
tests/kernels/test_awq_triton.py
View file @
622b7ab9
...
@@ -7,7 +7,7 @@ import torch
...
@@ -7,7 +7,7 @@ import torch
from
vllm.model_executor.layers.quantization.awq_triton
import
(
from
vllm.model_executor.layers.quantization.awq_triton
import
(
AWQ_TRITON_SUPPORTED_GROUP_SIZES
,
awq_dequantize_triton
,
awq_gemm_triton
)
AWQ_TRITON_SUPPORTED_GROUP_SIZES
,
awq_dequantize_triton
,
awq_gemm_triton
)
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
device
=
"cuda"
device
=
"cuda"
...
@@ -80,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
...
@@ -80,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols
=
qweight_cols
zeros_cols
=
qweight_cols
zeros_dtype
=
torch
.
int32
zeros_dtype
=
torch
.
int32
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
qweight
=
torch
.
randint
(
0
,
qweight
=
torch
.
randint
(
0
,
torch
.
iinfo
(
torch
.
int32
).
max
,
torch
.
iinfo
(
torch
.
int32
).
max
,
...
@@ -134,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size):
...
@@ -134,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows
=
scales_rows
qzeros_rows
=
scales_rows
qzeros_cols
=
qweight_cols
qzeros_cols
=
qweight_cols
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
input
=
torch
.
rand
((
input_rows
,
input_cols
),
input
=
torch
.
rand
((
input_rows
,
input_cols
),
dtype
=
input_dtype
,
dtype
=
input_dtype
,
...
...
tests/kernels/test_blocksparse_attention.py
View file @
622b7ab9
...
@@ -8,7 +8,7 @@ from vllm import _custom_ops as ops
...
@@ -8,7 +8,7 @@ from vllm import _custom_ops as ops
from
vllm.attention.ops.blocksparse_attention.interface
import
(
from
vllm.attention.ops.blocksparse_attention.interface
import
(
LocalStridedBlockSparseAttn
)
LocalStridedBlockSparseAttn
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
get_max_shared_memory_bytes
,
seed_everything
from
vllm.utils
import
get_max_shared_memory_bytes
from
.allclose_default
import
get_default_atol
,
get_default_rtol
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
@@ -173,7 +173,7 @@ def test_paged_attention(
...
@@ -173,7 +173,7 @@ def test_paged_attention(
blocksparse_block_size
:
int
,
blocksparse_block_size
:
int
,
blocksparse_head_sliding_step
:
int
,
blocksparse_head_sliding_step
:
int
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
num_query_heads
,
num_kv_heads
=
num_heads
num_query_heads
,
num_kv_heads
=
num_heads
...
@@ -384,7 +384,7 @@ def test_varlen_blocksparse_attention_prefill(
...
@@ -384,7 +384,7 @@ def test_varlen_blocksparse_attention_prefill(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
# As the xformers library is already tested with its own tests, we can use
...
...
tests/kernels/test_cache.py
View file @
622b7ab9
...
@@ -6,7 +6,7 @@ import torch
...
@@ -6,7 +6,7 @@ import torch
from
tests.kernels.utils
import
DEFAULT_OPCHECK_TEST_UTILS
,
opcheck
from
tests.kernels.utils
import
DEFAULT_OPCHECK_TEST_UTILS
,
opcheck
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
COPYING_DIRECTION
=
[(
'cuda'
,
'cpu'
),
(
'cuda'
,
'cuda'
),
(
'cpu'
,
'cuda'
)]
COPYING_DIRECTION
=
[(
'cuda'
,
'cpu'
),
(
'cuda'
,
'cuda'
),
(
'cpu'
,
'cuda'
)]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
...
@@ -56,7 +56,7 @@ def test_copy_blocks(
...
@@ -56,7 +56,7 @@ def test_copy_blocks(
)
->
None
:
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
pytest
.
skip
()
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
# Generate random block mappings where each source block is mapped to two
# Generate random block mappings where each source block is mapped to two
# destination blocks.
# destination blocks.
...
@@ -132,7 +132,7 @@ def test_reshape_and_cache(
...
@@ -132,7 +132,7 @@ def test_reshape_and_cache(
)
->
None
:
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
pytest
.
skip
()
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
# Create a random slot mapping.
# Create a random slot mapping.
num_slots
=
block_size
*
num_blocks
num_slots
=
block_size
*
num_blocks
...
@@ -224,7 +224,7 @@ def test_reshape_and_cache_flash(
...
@@ -224,7 +224,7 @@ def test_reshape_and_cache_flash(
device
:
str
,
device
:
str
,
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
# Create a random slot mapping.
# Create a random slot mapping.
...
@@ -339,7 +339,7 @@ def test_swap_blocks(
...
@@ -339,7 +339,7 @@ def test_swap_blocks(
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
pytest
.
skip
()
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
src_device
=
device
if
direction
[
0
]
==
"cuda"
else
'cpu'
src_device
=
device
if
direction
[
0
]
==
"cuda"
else
'cpu'
dst_device
=
device
if
direction
[
1
]
==
"cuda"
else
'cpu'
dst_device
=
device
if
direction
[
1
]
==
"cuda"
else
'cpu'
...
@@ -408,7 +408,7 @@ def test_fp8_e4m3_conversion(
...
@@ -408,7 +408,7 @@ def test_fp8_e4m3_conversion(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
low
=
-
224.0
low
=
-
224.0
high
=
224.0
high
=
224.0
...
...
tests/kernels/test_causal_conv1d.py
View file @
622b7ab9
...
@@ -9,7 +9,7 @@ from vllm import _custom_ops as ops # noqa: F401
...
@@ -9,7 +9,7 @@ from vllm import _custom_ops as ops # noqa: F401
from
vllm.attention.backends.utils
import
PAD_SLOT_ID
from
vllm.attention.backends.utils
import
PAD_SLOT_ID
from
vllm.model_executor.layers.mamba.ops.causal_conv1d
import
(
from
vllm.model_executor.layers.mamba.ops.causal_conv1d
import
(
causal_conv1d_fn
,
causal_conv1d_update
)
causal_conv1d_fn
,
causal_conv1d_update
)
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
def
causal_conv1d_ref
(
def
causal_conv1d_ref
(
...
@@ -161,7 +161,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
...
@@ -161,7 +161,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
if
itype
==
torch
.
bfloat16
:
if
itype
==
torch
.
bfloat16
:
rtol
,
atol
=
1e-2
,
5e-2
rtol
,
atol
=
1e-2
,
5e-2
# set seed
# set seed
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
x
=
torch
.
randn
(
batch
,
dim
,
seqlen
,
device
=
device
,
x
=
torch
.
randn
(
batch
,
dim
,
seqlen
,
device
=
device
,
dtype
=
itype
).
contiguous
()
dtype
=
itype
).
contiguous
()
...
@@ -223,7 +223,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
...
@@ -223,7 +223,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
if
itype
==
torch
.
bfloat16
:
if
itype
==
torch
.
bfloat16
:
rtol
,
atol
=
1e-2
,
5e-2
rtol
,
atol
=
1e-2
,
5e-2
# set seed
# set seed
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
batch
=
2
batch
=
2
x
=
torch
.
randn
(
batch
,
dim
,
seqlen
,
device
=
device
,
dtype
=
itype
)
x
=
torch
.
randn
(
batch
,
dim
,
seqlen
,
device
=
device
,
dtype
=
itype
)
x_ref
=
x
.
clone
()
x_ref
=
x
.
clone
()
...
@@ -270,7 +270,7 @@ def test_causal_conv1d_update_with_batch_gather(with_padding, dim, width,
...
@@ -270,7 +270,7 @@ def test_causal_conv1d_update_with_batch_gather(with_padding, dim, width,
rtol
,
atol
=
1e-2
,
5e-2
rtol
,
atol
=
1e-2
,
5e-2
# set seed
# set seed
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
batch_size
=
3
batch_size
=
3
padding
=
5
if
with_padding
else
0
padding
=
5
if
with_padding
else
0
...
@@ -343,7 +343,7 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias,
...
@@ -343,7 +343,7 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias,
if
itype
==
torch
.
bfloat16
:
if
itype
==
torch
.
bfloat16
:
rtol
,
atol
=
1e-2
,
5e-2
rtol
,
atol
=
1e-2
,
5e-2
# set seed
# set seed
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
seqlens
=
[]
seqlens
=
[]
batch_size
=
4
batch_size
=
4
if
seqlen
<
10
:
if
seqlen
<
10
:
...
...
tests/kernels/test_flash_attn.py
View file @
622b7ab9
...
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
...
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
import
pytest
import
pytest
import
torch
import
torch
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
from
vllm.vllm_flash_attn
import
(
flash_attn_varlen_func
,
from
vllm.vllm_flash_attn
import
(
flash_attn_varlen_func
,
flash_attn_with_kvcache
)
flash_attn_with_kvcache
)
...
@@ -91,7 +91,7 @@ def test_flash_attn_with_paged_kv(
...
@@ -91,7 +91,7 @@ def test_flash_attn_with_paged_kv(
sliding_window
:
Optional
[
int
],
sliding_window
:
Optional
[
int
],
)
->
None
:
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
num_seqs
=
len
(
kv_lens
)
num_seqs
=
len
(
kv_lens
)
num_query_heads
=
num_heads
[
0
]
num_query_heads
=
num_heads
[
0
]
num_kv_heads
=
num_heads
[
1
]
num_kv_heads
=
num_heads
[
1
]
...
@@ -161,7 +161,7 @@ def test_varlen_with_paged_kv(
...
@@ -161,7 +161,7 @@ def test_varlen_with_paged_kv(
num_blocks
:
int
,
num_blocks
:
int
,
)
->
None
:
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
num_seqs
=
len
(
seq_lens
)
num_seqs
=
len
(
seq_lens
)
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
...
...
tests/kernels/test_flashinfer.py
View file @
622b7ab9
...
@@ -4,7 +4,7 @@ import flashinfer
...
@@ -4,7 +4,7 @@ import flashinfer
import
pytest
import
pytest
import
torch
import
torch
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
NUM_HEADS
=
[(
16
,
16
),
(
32
,
8
),
(
64
,
8
),
(
6
,
1
)]
NUM_HEADS
=
[(
16
,
16
),
(
32
,
8
),
(
64
,
8
),
(
6
,
1
)]
HEAD_SIZES
=
[
128
,
256
]
HEAD_SIZES
=
[
128
,
256
]
...
@@ -84,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv(
...
@@ -84,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv(
soft_cap
:
Optional
[
float
],
soft_cap
:
Optional
[
float
],
)
->
None
:
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
num_seqs
=
len
(
kv_lens
)
num_seqs
=
len
(
kv_lens
)
num_query_heads
=
num_heads
[
0
]
num_query_heads
=
num_heads
[
0
]
num_kv_heads
=
num_heads
[
1
]
num_kv_heads
=
num_heads
[
1
]
...
@@ -170,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
...
@@ -170,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
block_size
:
int
,
block_size
:
int
,
soft_cap
:
Optional
[
float
])
->
None
:
soft_cap
:
Optional
[
float
])
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
num_seqs
=
len
(
seq_lens
)
num_seqs
=
len
(
seq_lens
)
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
...
@@ -268,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
...
@@ -268,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
head_size
:
int
,
dtype
:
torch
.
dtype
,
block_size
:
int
,
head_size
:
int
,
dtype
:
torch
.
dtype
,
block_size
:
int
,
soft_cap
:
Optional
[
float
])
->
None
:
soft_cap
:
Optional
[
float
])
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
num_seqs
=
len
(
seq_lens
)
num_seqs
=
len
(
seq_lens
)
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
query_lens
=
[
x
[
0
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
kv_lens
=
[
x
[
1
]
for
x
in
seq_lens
]
...
@@ -381,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
...
@@ -381,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
)
->
None
:
)
->
None
:
# test doesn't work for num_heads = (16,16)
# test doesn't work for num_heads = (16,16)
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
num_seqs
=
len
(
kv_lens
)
num_seqs
=
len
(
kv_lens
)
num_query_heads
=
num_heads
[
0
]
num_query_heads
=
num_heads
[
0
]
num_kv_heads
=
num_heads
[
1
]
num_kv_heads
=
num_heads
[
1
]
...
...
tests/kernels/test_fp8_quant.py
View file @
622b7ab9
...
@@ -6,7 +6,7 @@ from tests.kernels.quant_utils import (FP8_DTYPE,
...
@@ -6,7 +6,7 @@ from tests.kernels.quant_utils import (FP8_DTYPE,
ref_dynamic_per_tensor_fp8_quant
,
ref_dynamic_per_tensor_fp8_quant
,
ref_dynamic_per_token_quant
)
ref_dynamic_per_token_quant
)
from
tests.kernels.utils
import
opcheck
from
tests.kernels.utils
import
opcheck
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
1
,
2
,
3
,
4
,
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
HIDDEN_SIZES
=
[
1
,
2
,
3
,
4
,
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
...
@@ -46,7 +46,7 @@ def opcheck_fp8_quant(output,
...
@@ -46,7 +46,7 @@ def opcheck_fp8_quant(output,
def
test_dynamic_per_token_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
def
test_dynamic_per_token_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
scale_ub
:
bool
,
dtype
:
torch
.
dtype
,
scale_ub
:
bool
,
seed
:
int
)
->
None
:
seed
:
int
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
+
1e-6
# avoid nans
device
=
"cuda"
)
+
1e-6
# avoid nans
...
@@ -76,7 +76,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
...
@@ -76,7 +76,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_dynamic_per_tensor_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
def
test_dynamic_per_tensor_fp8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
...
@@ -95,7 +95,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
...
@@ -95,7 +95,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
def
test_fp8_quant_large
(
seed
:
int
)
->
None
:
def
test_fp8_quant_large
(
seed
:
int
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
num_tokens
=
1024000
# Mistral-Nemo's max_position_embeddings
num_tokens
=
1024000
# Mistral-Nemo's max_position_embeddings
hidden_size
=
1152
# Smallest hidden_size to reproduce the error
hidden_size
=
1152
# Smallest hidden_size to reproduce the error
...
...
tests/kernels/test_gguf.py
View file @
622b7ab9
...
@@ -7,7 +7,7 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
...
@@ -7,7 +7,7 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
import
vllm._custom_ops
as
ops
import
vllm._custom_ops
as
ops
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
GGUF_SAMPLE
=
snapshot_download
(
"Isotr0py/test-gguf-sample"
)
GGUF_SAMPLE
=
snapshot_download
(
"Isotr0py/test-gguf-sample"
)
...
@@ -75,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
...
@@ -75,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_mmvq
(
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
def
test_mmvq
(
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
):
quant_type
:
GGMLQuantizationType
):
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
x
=
torch
.
rand
((
1
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
x
=
torch
.
rand
((
1
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
...
@@ -111,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype,
...
@@ -111,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_mmq
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
def
test_mmq
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
quant_type
:
GGMLQuantizationType
):
quant_type
:
GGMLQuantizationType
):
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
tensors
=
get_gguf_sample_tensors
(
hidden_size
,
quant_type
)
x
=
torch
.
rand
((
num_tokens
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
x
=
torch
.
rand
((
num_tokens
,
hidden_size
),
dtype
=
dtype
,
device
=
"cuda"
)
...
...
tests/kernels/test_int8_quant.py
View file @
622b7ab9
...
@@ -4,7 +4,7 @@ import torch
...
@@ -4,7 +4,7 @@ import torch
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
tests.kernels.utils
import
opcheck
from
tests.kernels.utils
import
opcheck
from
vllm._custom_ops
import
scaled_int8_quant
from
vllm._custom_ops
import
scaled_int8_quant
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
...
@@ -45,7 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
...
@@ -45,7 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_dynamic_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
def
test_dynamic_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
@@ -68,7 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
...
@@ -68,7 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_dynamic_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
def
test_dynamic_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
...
@@ -112,7 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
...
@@ -112,7 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
def
test_static_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
def
test_static_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
)
->
None
:
scale
:
float
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
...
@@ -138,7 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
...
@@ -138,7 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
def
test_static_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
def
test_static_scaled_int8_azp_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
,
azp
:
int
)
->
None
:
scale
:
float
,
azp
:
int
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
...
...
tests/kernels/test_layernorm.py
View file @
622b7ab9
...
@@ -3,7 +3,7 @@ import torch
...
@@ -3,7 +3,7 @@ import torch
from
tests.kernels.utils
import
opcheck
from
tests.kernels.utils
import
opcheck
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
NUM_TOKENS
=
[
7
,
83
,
4096
]
# Arbitrary values for testing
NUM_TOKENS
=
[
7
,
83
,
4096
]
# Arbitrary values for testing
...
@@ -31,7 +31,7 @@ def test_rms_norm(
...
@@ -31,7 +31,7 @@ def test_rms_norm(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
layer
=
RMSNorm
(
hidden_size
).
to
(
dtype
=
dtype
)
layer
=
RMSNorm
(
hidden_size
).
to
(
dtype
=
dtype
)
layer
.
weight
.
data
.
normal_
(
mean
=
1.0
,
std
=
0.1
)
layer
.
weight
.
data
.
normal_
(
mean
=
1.0
,
std
=
0.1
)
...
...
tests/kernels/test_mamba_ssm.py
View file @
622b7ab9
...
@@ -8,7 +8,7 @@ from vllm import _custom_ops as ops # noqa: F401
...
@@ -8,7 +8,7 @@ from vllm import _custom_ops as ops # noqa: F401
from
vllm.attention.backends.utils
import
PAD_SLOT_ID
from
vllm.attention.backends.utils
import
PAD_SLOT_ID
from
vllm.model_executor.layers.mamba.ops.mamba_ssm
import
(
from
vllm.model_executor.layers.mamba.ops.mamba_ssm
import
(
selective_scan_fn
,
selective_state_update
)
selective_scan_fn
,
selective_state_update
)
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
def
selective_state_update_ref
(
state
,
def
selective_state_update_ref
(
state
,
...
@@ -235,7 +235,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
...
@@ -235,7 +235,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
rtolw
=
max
(
rtolw
,
rtol
)
rtolw
=
max
(
rtolw
,
rtol
)
atolw
=
max
(
atolw
,
atol
)
atolw
=
max
(
atolw
,
atol
)
# set seed
# set seed
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
batch_size
=
1
batch_size
=
1
dim
=
4
dim
=
4
dstate
=
8
dstate
=
8
...
@@ -358,7 +358,7 @@ def test_selective_state_update(dim, dstate, has_z, itype):
...
@@ -358,7 +358,7 @@ def test_selective_state_update(dim, dstate, has_z, itype):
if
torch
.
version
.
hip
:
if
torch
.
version
.
hip
:
atol
*=
2
atol
*=
2
# set seed
# set seed
seed_everything
(
0
)
current_platform
.
seed_everything
(
0
)
batch_size
=
1
batch_size
=
1
state
=
torch
.
randn
(
batch_size
,
dim
,
dstate
,
dtype
=
itype
,
device
=
device
)
state
=
torch
.
randn
(
batch_size
,
dim
,
dstate
,
dtype
=
itype
,
device
=
device
)
x
=
torch
.
randn
(
batch_size
,
dim
,
device
=
device
,
dtype
=
itype
)
x
=
torch
.
randn
(
batch_size
,
dim
,
device
=
device
,
dtype
=
itype
)
...
...
tests/kernels/test_moe.py
View file @
622b7ab9
...
@@ -19,7 +19,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
...
@@ -19,7 +19,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
from
vllm.model_executor.models.mixtral
import
MixtralMoE
from
vllm.model_executor.models.mixtral
import
MixtralMoE
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.scalar_type
import
scalar_types
from
vllm.utils
import
seed_everything
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1024
*
128
,
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1024
*
128
,
512
,
222
,
33
,
1
])
...
@@ -115,7 +114,7 @@ def test_fused_marlin_moe(
...
@@ -115,7 +114,7 @@ def test_fused_marlin_moe(
num_bits
:
int
,
num_bits
:
int
,
is_k_full
:
bool
,
is_k_full
:
bool
,
):
):
seed_everything
(
7
)
current_platform
.
seed_everything
(
7
)
# Filter act_order
# Filter act_order
if
act_order
:
if
act_order
:
...
...
tests/kernels/test_pos_encoding.py
View file @
622b7ab9
...
@@ -5,7 +5,7 @@ import pytest
...
@@ -5,7 +5,7 @@ import pytest
import
torch
import
torch
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.
util
s
import
seed_everything
from
vllm.
platform
s
import
current_platform
from
.allclose_default
import
get_default_atol
,
get_default_rtol
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
@@ -48,7 +48,7 @@ def test_rotary_embedding(
...
@@ -48,7 +48,7 @@ def test_rotary_embedding(
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
...
@@ -100,7 +100,7 @@ def test_batched_rotary_embedding(
...
@@ -100,7 +100,7 @@ def test_batched_rotary_embedding(
max_position
:
int
=
8192
,
max_position
:
int
=
8192
,
base
:
int
=
10000
,
base
:
int
=
10000
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
...
@@ -160,7 +160,7 @@ def test_batched_rotary_embedding_multi_lora(
...
@@ -160,7 +160,7 @@ def test_batched_rotary_embedding_multi_lora(
max_position
:
int
=
8192
,
max_position
:
int
=
8192
,
base
:
int
=
10000
,
base
:
int
=
10000
,
)
->
None
:
)
->
None
:
seed_everything
(
seed
)
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment