Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bb4337b3
Unverified
Commit
bb4337b3
authored
Jan 05, 2026
by
wangxiyuan
Committed by
GitHub
Jan 04, 2026
Browse files
[Platform] Deprecate seed_everything (#31659)
Signed-off-by:
wangxiyuan
<
wangxiyuan1007@gmail.com
>
parent
367856de
Changes
77
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
54 additions
and
44 deletions
+54
-44
tests/kernels/core/test_pos_encoding.py
tests/kernels/core/test_pos_encoding.py
+2
-2
tests/kernels/mamba/test_causal_conv1d.py
tests/kernels/mamba/test_causal_conv1d.py
+4
-4
tests/kernels/mamba/test_mamba_mixer2.py
tests/kernels/mamba/test_mamba_mixer2.py
+2
-2
tests/kernels/mamba/test_mamba_ssm.py
tests/kernels/mamba/test_mamba_ssm.py
+6
-6
tests/kernels/mamba/test_mamba_ssm_ssd.py
tests/kernels/mamba/test_mamba_ssm_ssd.py
+2
-2
tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
...s/kernels/moe/modular_kernel_tools/make_feature_matrix.py
+2
-2
tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
...ernels/moe/modular_kernel_tools/profile_modular_kernel.py
+2
-2
tests/kernels/moe/test_batched_moe.py
tests/kernels/moe/test_batched_moe.py
+3
-2
tests/kernels/moe/test_cpu_fused_moe.py
tests/kernels/moe/test_cpu_fused_moe.py
+2
-1
tests/kernels/moe/test_cutlass_moe.py
tests/kernels/moe/test_cutlass_moe.py
+4
-3
tests/kernels/moe/test_deepep_deepgemm_moe.py
tests/kernels/moe/test_deepep_deepgemm_moe.py
+4
-4
tests/kernels/moe/test_deepep_moe.py
tests/kernels/moe/test_deepep_moe.py
+3
-3
tests/kernels/moe/test_flashinfer.py
tests/kernels/moe/test_flashinfer.py
+3
-2
tests/kernels/moe/test_flashinfer_moe.py
tests/kernels/moe/test_flashinfer_moe.py
+2
-1
tests/kernels/moe/test_grouped_topk.py
tests/kernels/moe/test_grouped_topk.py
+2
-1
tests/kernels/moe/test_modular_kernel_combinations.py
tests/kernels/moe/test_modular_kernel_combinations.py
+2
-2
tests/kernels/moe/test_modular_oai_triton_moe.py
tests/kernels/moe/test_modular_oai_triton_moe.py
+2
-1
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+2
-1
tests/kernels/moe/test_moe_align_block_size.py
tests/kernels/moe/test_moe_align_block_size.py
+2
-1
tests/kernels/moe/test_moe_permute_unpermute.py
tests/kernels/moe/test_moe_permute_unpermute.py
+3
-2
No files found.
tests/kernels/core/test_pos_encoding.py
View file @
bb4337b3
...
@@ -9,7 +9,7 @@ import torch
...
@@ -9,7 +9,7 @@ import torch
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
IS_NEOX_STYLE
=
[
True
,
False
]
IS_NEOX_STYLE
=
[
True
,
False
]
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float
]
...
@@ -79,7 +79,7 @@ def test_rotary_embedding(
...
@@ -79,7 +79,7 @@ def test_rotary_embedding(
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
...
...
tests/kernels/mamba/test_causal_conv1d.py
View file @
bb4337b3
...
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
...
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_fn
,
causal_conv1d_fn
,
causal_conv1d_update
,
causal_conv1d_update
,
)
)
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
def
causal_conv1d_ref
(
def
causal_conv1d_ref
(
...
@@ -154,7 +154,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity
...
@@ -154,7 +154,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity
if
itype
==
torch
.
bfloat16
:
if
itype
==
torch
.
bfloat16
:
rtol
,
atol
=
1e-2
,
5e-2
rtol
,
atol
=
1e-2
,
5e-2
# set seed
# set seed
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
batch
=
2
batch
=
2
x
=
torch
.
randn
(
batch
,
dim
,
seqlen
,
device
=
device
,
dtype
=
itype
)
x
=
torch
.
randn
(
batch
,
dim
,
seqlen
,
device
=
device
,
dtype
=
itype
)
x_ref
=
x
.
clone
()
x_ref
=
x
.
clone
()
...
@@ -201,7 +201,7 @@ def test_causal_conv1d_update_with_batch_gather(
...
@@ -201,7 +201,7 @@ def test_causal_conv1d_update_with_batch_gather(
rtol
,
atol
=
1e-2
,
5e-2
rtol
,
atol
=
1e-2
,
5e-2
# set seed
# set seed
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
padding
=
5
if
with_padding
else
0
padding
=
5
if
with_padding
else
0
padded_batch_size
=
batch_size
+
padding
padded_batch_size
=
batch_size
+
padding
...
@@ -278,7 +278,7 @@ def test_causal_conv1d_varlen(
...
@@ -278,7 +278,7 @@ def test_causal_conv1d_varlen(
if
itype
==
torch
.
bfloat16
:
if
itype
==
torch
.
bfloat16
:
rtol
,
atol
=
1e-2
,
5e-2
rtol
,
atol
=
1e-2
,
5e-2
# set seed
# set seed
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
seqlens
=
[]
seqlens
=
[]
batch_size
=
batch
batch_size
=
batch
padding
=
3
if
with_padding
else
0
padding
=
3
if
with_padding
else
0
...
...
tests/kernels/mamba/test_mamba_mixer2.py
View file @
bb4337b3
...
@@ -12,8 +12,8 @@ from vllm.distributed.parallel_state import (
...
@@ -12,8 +12,8 @@ from vllm.distributed.parallel_state import (
initialize_model_parallel
,
initialize_model_parallel
,
)
)
from
vllm.model_executor.layers.mamba.mamba_mixer2
import
Mixer2RMSNormGated
from
vllm.model_executor.layers.mamba.mamba_mixer2
import
Mixer2RMSNormGated
from
vllm.platforms
import
current_platform
from
vllm.utils.system_utils
import
update_environment_variables
from
vllm.utils.system_utils
import
update_environment_variables
from
vllm.utils.torch_utils
import
set_random_seed
@
multi_gpu_test
(
num_gpus
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
...
@@ -68,7 +68,7 @@ def mixer2_gated_norm_tensor_parallel(
...
@@ -68,7 +68,7 @@ def mixer2_gated_norm_tensor_parallel(
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
device
:
str
,
device
:
str
,
):
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
...
...
tests/kernels/mamba/test_mamba_ssm.py
View file @
bb4337b3
...
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
...
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
selective_scan_fn
,
selective_scan_fn
,
selective_state_update
,
selective_state_update
,
)
)
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
def
selective_state_update_ref
(
def
selective_state_update_ref
(
...
@@ -271,7 +271,7 @@ def test_selective_scan(
...
@@ -271,7 +271,7 @@ def test_selective_scan(
rtolw
=
max
(
rtolw
,
rtol
)
rtolw
=
max
(
rtolw
,
rtol
)
atolw
=
max
(
atolw
,
atol
)
atolw
=
max
(
atolw
,
atol
)
# set seed
# set seed
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
batch_size
=
1
batch_size
=
1
dim
=
4
dim
=
4
dstate
=
8
dstate
=
8
...
@@ -401,7 +401,7 @@ def test_selective_state_update(dim, dstate, has_z, itype):
...
@@ -401,7 +401,7 @@ def test_selective_state_update(dim, dstate, has_z, itype):
if
torch
.
version
.
hip
:
if
torch
.
version
.
hip
:
atol
*=
2
atol
*=
2
# set seed
# set seed
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
batch_size
=
1
batch_size
=
1
state
=
torch
.
randn
(
batch_size
,
dim
,
dstate
,
dtype
=
itype
,
device
=
device
)
state
=
torch
.
randn
(
batch_size
,
dim
,
dstate
,
dtype
=
itype
,
device
=
device
)
x
=
torch
.
randn
(
batch_size
,
dim
,
device
=
device
,
dtype
=
itype
)
x
=
torch
.
randn
(
batch_size
,
dim
,
device
=
device
,
dtype
=
itype
)
...
@@ -438,7 +438,7 @@ def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len):
...
@@ -438,7 +438,7 @@ def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len):
if
torch
.
version
.
hip
:
if
torch
.
version
.
hip
:
atol
*=
2
atol
*=
2
# set seed
# set seed
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
batch_size
=
4
batch_size
=
4
token_counts
=
torch
.
randint
(
1
,
max_seq_len
+
1
,
(
batch_size
,),
device
=
device
)
token_counts
=
torch
.
randint
(
1
,
max_seq_len
+
1
,
(
batch_size
,),
device
=
device
)
total_tokens
=
int
(
token_counts
.
sum
().
item
())
total_tokens
=
int
(
token_counts
.
sum
().
item
())
...
@@ -857,7 +857,7 @@ def test_selective_state_update_with_num_accepted_tokens(
...
@@ -857,7 +857,7 @@ def test_selective_state_update_with_num_accepted_tokens(
if
torch
.
version
.
hip
:
if
torch
.
version
.
hip
:
atol
*=
2
atol
*=
2
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
batch_size
=
4
batch_size
=
4
tokens_per_seq
=
torch
.
randint
(
1
,
max_seq_len
+
1
,
(
batch_size
,),
device
=
device
)
tokens_per_seq
=
torch
.
randint
(
1
,
max_seq_len
+
1
,
(
batch_size
,),
device
=
device
)
...
@@ -983,7 +983,7 @@ def test_selective_state_update_varlen_with_num_accepted(
...
@@ -983,7 +983,7 @@ def test_selective_state_update_varlen_with_num_accepted(
if
torch
.
version
.
hip
:
if
torch
.
version
.
hip
:
atol
*=
2
atol
*=
2
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
batch_size
=
4
batch_size
=
4
tokens_per_seq
=
torch
.
randint
(
1
,
max_seq_len
+
1
,
(
batch_size
,),
device
=
device
)
tokens_per_seq
=
torch
.
randint
(
1
,
max_seq_len
+
1
,
(
batch_size
,),
device
=
device
)
...
...
tests/kernels/mamba/test_mamba_ssm_ssd.py
View file @
bb4337b3
...
@@ -9,7 +9,7 @@ from einops import rearrange, repeat
...
@@ -9,7 +9,7 @@ from einops import rearrange, repeat
from
vllm.model_executor.layers.mamba.ops.ssd_combined
import
(
from
vllm.model_executor.layers.mamba.ops.ssd_combined
import
(
mamba_chunk_scan_combined_varlen
,
mamba_chunk_scan_combined_varlen
,
)
)
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
from
vllm.v1.attention.backends.mamba2_attn
import
compute_varlen_chunk_metadata
from
vllm.v1.attention.backends.mamba2_attn
import
compute_varlen_chunk_metadata
# Added by the IBM Team, 2024
# Added by the IBM Team, 2024
...
@@ -82,7 +82,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
...
@@ -82,7 +82,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
def
generate_random_inputs
(
batch_size
,
seqlen
,
n_heads
,
d_head
,
itype
,
device
=
"cuda"
):
def
generate_random_inputs
(
batch_size
,
seqlen
,
n_heads
,
d_head
,
itype
,
device
=
"cuda"
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
A
=
-
torch
.
exp
(
torch
.
rand
(
n_heads
,
dtype
=
itype
,
device
=
device
))
A
=
-
torch
.
exp
(
torch
.
rand
(
n_heads
,
dtype
=
itype
,
device
=
device
))
dt
=
F
.
softplus
(
dt
=
F
.
softplus
(
torch
.
randn
(
batch_size
,
seqlen
,
n_heads
,
dtype
=
itype
,
device
=
device
)
-
4
torch
.
randn
(
batch_size
,
seqlen
,
n_heads
,
dtype
=
itype
,
device
=
device
)
-
4
...
...
tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
View file @
bb4337b3
...
@@ -10,7 +10,7 @@ from tqdm import tqdm
...
@@ -10,7 +10,7 @@ from tqdm import tqdm
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.fused_moe.config
import
FUSED_MOE_UNQUANTIZED_CONFIG
from
vllm.model_executor.layers.fused_moe.config
import
FUSED_MOE_UNQUANTIZED_CONFIG
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
from
.common
import
(
from
.common
import
(
Config
,
Config
,
...
@@ -40,7 +40,7 @@ def rank_worker(
...
@@ -40,7 +40,7 @@ def rank_worker(
config
:
Config
,
config
:
Config
,
weights
:
WeightTensors
,
weights
:
WeightTensors
,
):
):
current_platform
.
seed_everything
(
pgi
.
rank
)
set_random_seed
(
pgi
.
rank
)
# sanity check
# sanity check
from
vllm
import
envs
from
vllm
import
envs
...
...
tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
View file @
bb4337b3
...
@@ -9,7 +9,7 @@ from typing import Any
...
@@ -9,7 +9,7 @@ from typing import Any
import
torch
import
torch
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
from
.common
import
Config
,
RankTensors
,
WeightTensors
,
make_modular_kernel
from
.common
import
Config
,
RankTensors
,
WeightTensors
,
make_modular_kernel
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch_with_config
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch_with_config
...
@@ -82,7 +82,7 @@ def rank_worker(
...
@@ -82,7 +82,7 @@ def rank_worker(
config
:
Config
,
config
:
Config
,
weights
:
WeightTensors
,
weights
:
WeightTensors
,
):
):
current_platform
.
seed_everything
(
pgi
.
rank
)
set_random_seed
(
pgi
.
rank
)
# sanity check
# sanity check
from
vllm
import
envs
from
vllm
import
envs
...
...
tests/kernels/moe/test_batched_moe.py
View file @
bb4337b3
...
@@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
...
@@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_topk
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_topk
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
tl
from
vllm.triton_utils
import
tl
from
vllm.utils.torch_utils
import
set_random_seed
MNK_FACTORS
=
[
MNK_FACTORS
=
[
(
1
,
128
,
128
),
(
1
,
128
,
128
),
...
@@ -115,7 +116,7 @@ def test_batched_mm(
...
@@ -115,7 +116,7 @@ def test_batched_mm(
):
):
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89,
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89,
and those tests will be skipped on unsupported hardware."""
and those tests will be skipped on unsupported hardware."""
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
use_fp8_w8a8
=
dtype
==
torch
.
float8_e4m3fn
use_fp8_w8a8
=
dtype
==
torch
.
float8_e4m3fn
...
@@ -252,7 +253,7 @@ def test_fused_moe_batched_experts(
...
@@ -252,7 +253,7 @@ def test_fused_moe_batched_experts(
):
):
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89,
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89,
and those tests will be skipped on unsupported hardware."""
and those tests will be skipped on unsupported hardware."""
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
use_fp8_w8a8
=
dtype
==
torch
.
float8_e4m3fn
use_fp8_w8a8
=
dtype
==
torch
.
float8_e4m3fn
...
...
tests/kernels/moe/test_cpu_fused_moe.py
View file @
bb4337b3
...
@@ -8,6 +8,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol
...
@@ -8,6 +8,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from
vllm._custom_ops
import
cpu_fused_moe
,
cpu_prepack_moe_weight
from
vllm._custom_ops
import
cpu_fused_moe
,
cpu_prepack_moe_weight
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
SwigluOAIAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
SwigluOAIAndMul
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
if
not
current_platform
.
is_cpu
():
if
not
current_platform
.
is_cpu
():
pytest
.
skip
(
"skipping CPU-only tests"
,
allow_module_level
=
True
)
pytest
.
skip
(
"skipping CPU-only tests"
,
allow_module_level
=
True
)
...
@@ -114,7 +115,7 @@ def test_cpu_fused_moe(
...
@@ -114,7 +115,7 @@ def test_cpu_fused_moe(
act
:
str
,
act
:
str
,
isa
:
str
,
isa
:
str
,
):
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
topk_num
=
max
(
expert_num
//
2
,
1
)
topk_num
=
max
(
expert_num
//
2
,
1
)
up_dim
=
2
*
intermediate_size
up_dim
=
2
*
intermediate_size
...
...
tests/kernels/moe/test_cutlass_moe.py
View file @
bb4337b3
...
@@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
...
@@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_experts
,
fused_topk
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_experts
,
fused_topk
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
NUM_EXPERTS
=
[
40
,
64
]
NUM_EXPERTS
=
[
40
,
64
]
TOP_KS
=
[
6
,
8
]
TOP_KS
=
[
6
,
8
]
...
@@ -277,7 +278,7 @@ def test_cutlass_moe_8_bit_no_graph(
...
@@ -277,7 +278,7 @@ def test_cutlass_moe_8_bit_no_graph(
workspace_init
,
workspace_init
,
ep_size
:
int
|
None
=
None
,
ep_size
:
int
|
None
=
None
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
mt
=
MOETensors8Bit
.
make_moe_tensors_8bit
(
m
,
k
,
n
,
e
,
per_act_token
,
per_out_ch
)
mt
=
MOETensors8Bit
.
make_moe_tensors_8bit
(
m
,
k
,
n
,
e
,
per_act_token
,
per_out_ch
)
...
@@ -332,7 +333,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
...
@@ -332,7 +333,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
monkeypatch
,
monkeypatch
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
dtype
=
torch
.
half
dtype
=
torch
.
half
...
@@ -469,7 +470,7 @@ def test_run_cutlass_moe_fp8(
...
@@ -469,7 +470,7 @@ def test_run_cutlass_moe_fp8(
ep_size
:
int
,
ep_size
:
int
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
mt
=
MOETensors8Bit
.
make_moe_tensors_8bit
(
mt
=
MOETensors8Bit
.
make_moe_tensors_8bit
(
m
,
k
,
n
,
e
,
per_act_token
,
per_out_channel
m
,
k
,
n
,
e
,
per_act_token
,
per_out_channel
...
...
tests/kernels/moe/test_deepep_deepgemm_moe.py
View file @
bb4337b3
...
@@ -22,13 +22,13 @@ from vllm.model_executor.layers.fused_moe.config import (
...
@@ -22,13 +22,13 @@ from vllm.model_executor.layers.fused_moe.config import (
)
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_experts
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_experts
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.platforms
import
current_platform
from
vllm.utils.deep_gemm
import
(
from
vllm.utils.deep_gemm
import
(
get_mk_alignment_for_contiguous_layout
,
get_mk_alignment_for_contiguous_layout
,
is_deep_gemm_e8m0_used
,
is_deep_gemm_e8m0_used
,
is_deep_gemm_supported
,
is_deep_gemm_supported
,
)
)
from
vllm.utils.import_utils
import
has_deep_ep
,
has_deep_gemm
from
vllm.utils.import_utils
import
has_deep_ep
,
has_deep_gemm
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
...utils
import
multi_gpu_test
from
...utils
import
multi_gpu_test
...
@@ -367,7 +367,7 @@ def _test_deepep_deepgemm_moe(
...
@@ -367,7 +367,7 @@ def _test_deepep_deepgemm_moe(
device
=
torch
.
device
(
f
"cuda:
{
pgi
.
local_rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
pgi
.
local_rank
}
"
)
init_workspace_manager
(
device
)
init_workspace_manager
(
device
)
current_platform
.
seed_everything
(
pgi
.
rank
)
set_random_seed
(
pgi
.
rank
)
w1
=
w1
.
to
(
device
=
torch
.
cuda
.
current_device
())
w1
=
w1
.
to
(
device
=
torch
.
cuda
.
current_device
())
w2
=
w2
.
to
(
device
=
torch
.
cuda
.
current_device
())
w2
=
w2
.
to
(
device
=
torch
.
cuda
.
current_device
())
...
@@ -456,7 +456,7 @@ def test_ht_deepep_deepgemm_moe(
...
@@ -456,7 +456,7 @@ def test_ht_deepep_deepgemm_moe(
"""
"""
m
,
n
,
k
=
mnk
m
,
n
,
k
=
mnk
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
if
topk
>
num_experts
:
if
topk
>
num_experts
:
pytest
.
skip
(
f
"Skipping test: topk=
{
topk
}
> E=
{
num_experts
}
"
)
pytest
.
skip
(
f
"Skipping test: topk=
{
topk
}
> E=
{
num_experts
}
"
)
...
@@ -531,7 +531,7 @@ def test_ll_deepep_deepgemm_moe(
...
@@ -531,7 +531,7 @@ def test_ll_deepep_deepgemm_moe(
assert
not
is_deep_gemm_e8m0_used
()
assert
not
is_deep_gemm_e8m0_used
()
m
,
n
,
k
=
mnk
m
,
n
,
k
=
mnk
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
if
topk
>
num_experts
:
if
topk
>
num_experts
:
pytest
.
skip
(
f
"Skipping test: topk=
{
topk
}
> E=
{
num_experts
}
"
)
pytest
.
skip
(
f
"Skipping test: topk=
{
topk
}
> E=
{
num_experts
}
"
)
...
...
tests/kernels/moe/test_deepep_moe.py
View file @
bb4337b3
...
@@ -20,8 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
...
@@ -20,8 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
(
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
(
per_token_group_quant_fp8
,
per_token_group_quant_fp8
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.utils.import_utils
import
has_deep_ep
from
vllm.utils.import_utils
import
has_deep_ep
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
...utils
import
multi_gpu_test
from
...utils
import
multi_gpu_test
...
@@ -446,7 +446,7 @@ def test_deep_ep_moe(
...
@@ -446,7 +446,7 @@ def test_deep_ep_moe(
low_latency_mode
=
False
low_latency_mode
=
False
use_fp8_dispatch
=
False
use_fp8_dispatch
=
False
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
world_size
,
dp_size
=
world_dp_size
world_size
,
dp_size
=
world_dp_size
config
=
TestConfig
(
dtype
=
dtype
,
topk
=
topk
,
m
=
m
,
k
=
k
,
n
=
n
,
num_experts
=
num_experts
)
config
=
TestConfig
(
dtype
=
dtype
,
topk
=
topk
,
m
=
m
,
k
=
k
,
n
=
n
,
num_experts
=
num_experts
)
...
@@ -507,7 +507,7 @@ def test_low_latency_deep_ep_moe(
...
@@ -507,7 +507,7 @@ def test_low_latency_deep_ep_moe(
f
"hidden sizes
{
DeepEPLLPrepareAndFinalize
.
SUPPORTED_HIDDEN_SIZES
}
"
f
"hidden sizes
{
DeepEPLLPrepareAndFinalize
.
SUPPORTED_HIDDEN_SIZES
}
"
)
)
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
world_size
,
dp_size
=
world_dp_size
world_size
,
dp_size
=
world_dp_size
config
=
TestConfig
(
dtype
=
dtype
,
topk
=
topk
,
m
=
m
,
k
=
k
,
n
=
n
,
num_experts
=
num_experts
)
config
=
TestConfig
(
dtype
=
dtype
,
topk
=
topk
,
m
=
m
,
k
=
k
,
n
=
n
,
num_experts
=
num_experts
)
...
...
tests/kernels/moe/test_flashinfer.py
View file @
bb4337b3
...
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
...
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
input_to_float8
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
input_to_float8
from
vllm.model_executor.models.llama4
import
Llama4MoE
from
vllm.model_executor.models.llama4
import
Llama4MoE
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
try
:
try
:
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
...
@@ -158,7 +159,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
...
@@ -158,7 +159,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
):
):
if
not
current_platform
.
has_device_capability
(
100
):
if
not
current_platform
.
has_device_capability
(
100
):
pytest
.
skip
(
"Test is only supported for sm >= 100"
)
pytest
.
skip
(
"Test is only supported for sm >= 100"
)
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
td
=
TestData
.
make_moe_tensors_8bit
(
m
,
k
,
n
,
e
,
reorder
=
True
)
td
=
TestData
.
make_moe_tensors_8bit
(
m
,
k
,
n
,
e
,
reorder
=
True
)
...
@@ -222,7 +223,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
...
@@ -222,7 +223,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
monkeypatch
,
monkeypatch
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
td
=
TestData
.
make_moe_tensors_8bit
(
td
=
TestData
.
make_moe_tensors_8bit
(
...
...
tests/kernels/moe/test_flashinfer_moe.py
View file @
bb4337b3
...
@@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
...
@@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
vllm.utils.torch_utils
import
set_random_seed
if
not
has_flashinfer_cutlass_fused_moe
()
or
not
current_platform
.
has_device_capability
(
if
not
has_flashinfer_cutlass_fused_moe
()
or
not
current_platform
.
has_device_capability
(
100
100
...
@@ -60,7 +61,7 @@ def test_flashinfer_fp4_moe_no_graph(
...
@@ -60,7 +61,7 @@ def test_flashinfer_fp4_moe_no_graph(
activation
:
str
,
activation
:
str
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
with
set_current_vllm_config
(
with
set_current_vllm_config
(
VllmConfig
(
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
1
))
VllmConfig
(
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
1
))
):
):
...
...
tests/kernels/moe/test_grouped_topk.py
View file @
bb4337b3
...
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
...
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_grouped_topk
,
fused_grouped_topk
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
@
pytest
.
mark
.
skipif
(
@
pytest
.
mark
.
skipif
(
...
@@ -52,7 +53,7 @@ def test_grouped_topk(
...
@@ -52,7 +53,7 @@ def test_grouped_topk(
)
)
get_cached_compilation_config
.
cache_clear
()
get_cached_compilation_config
.
cache_clear
()
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
hidden_states
=
torch
.
randn
((
n_token
,
n_hidden
),
dtype
=
dtype
,
device
=
"cuda"
)
hidden_states
=
torch
.
randn
((
n_token
,
n_hidden
),
dtype
=
dtype
,
device
=
"cuda"
)
gating_output
=
torch
.
randn
((
n_token
,
n_expert
),
dtype
=
dtype
,
device
=
"cuda"
)
gating_output
=
torch
.
randn
((
n_token
,
n_expert
),
dtype
=
dtype
,
device
=
"cuda"
)
e_score_correction_bias
=
torch
.
randn
(
e_score_correction_bias
=
torch
.
randn
(
...
...
tests/kernels/moe/test_modular_kernel_combinations.py
View file @
bb4337b3
...
@@ -15,7 +15,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
...
@@ -15,7 +15,7 @@ from vllm.config import VllmConfig, set_current_vllm_config
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
vllm.utils.import_utils
import
has_deep_ep
,
has_deep_gemm
,
has_pplx
from
vllm.utils.import_utils
import
has_deep_ep
,
has_deep_gemm
,
has_pplx
from
vllm.utils.torch_utils
import
cuda_device_count_stateless
from
vllm.utils.torch_utils
import
cuda_device_count_stateless
,
set_random_seed
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
.modular_kernel_tools.common
import
(
from
.modular_kernel_tools.common
import
(
...
@@ -82,7 +82,7 @@ def rank_worker(
...
@@ -82,7 +82,7 @@ def rank_worker(
device
=
torch
.
device
(
f
"cuda:
{
pgi
.
local_rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
pgi
.
local_rank
}
"
)
init_workspace_manager
(
device
)
init_workspace_manager
(
device
)
current_platform
.
seed_everything
(
pgi
.
rank
)
set_random_seed
(
pgi
.
rank
)
# sanity check
# sanity check
from
vllm
import
envs
from
vllm
import
envs
...
...
tests/kernels/moe/test_modular_oai_triton_moe.py
View file @
bb4337b3
...
@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
...
@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
)
)
from
vllm.model_executor.layers.utils
import
shuffle_weight
from
vllm.model_executor.layers.utils
import
shuffle_weight
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
MNK
=
[
MNK
=
[
(
1
,
512
,
384
),
(
1
,
512
,
384
),
...
@@ -211,7 +212,7 @@ def test_oai_triton_moe(
...
@@ -211,7 +212,7 @@ def test_oai_triton_moe(
unfused
:
bool
,
unfused
:
bool
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
(
(
w1
,
w1
,
w2
,
w2
,
...
...
tests/kernels/moe/test_moe.py
View file @
bb4337b3
...
@@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
...
@@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
from
vllm.model_executor.models.mixtral
import
MixtralMoE
from
vllm.model_executor.models.mixtral
import
MixtralMoE
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
ScalarType
,
scalar_types
from
vllm.scalar_type
import
ScalarType
,
scalar_types
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
vllm.v1.worker.workspace
import
init_workspace_manager
NUM_EXPERTS
=
[
8
,
64
,
192
]
NUM_EXPERTS
=
[
8
,
64
,
192
]
...
@@ -234,7 +235,7 @@ def test_fused_moe(
...
@@ -234,7 +235,7 @@ def test_fused_moe(
monkeypatch
,
monkeypatch
,
workspace_init
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
set_random_seed
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
str
(
chunk_size
))
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
str
(
chunk_size
))
...
...
tests/kernels/moe/test_moe_align_block_size.py
View file @
bb4337b3
...
@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
...
@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.torch_utils
import
set_random_seed
NUM_TOKENS
=
[
1
,
3
,
256
,
2256
,
4096
]
NUM_TOKENS
=
[
1
,
3
,
256
,
2256
,
4096
]
NUM_EXPERTS
=
[
32
,
160
,
256
,
257
]
NUM_EXPERTS
=
[
32
,
160
,
256
,
257
]
TOP_KS
=
[
1
,
2
,
16
,
32
]
TOP_KS
=
[
1
,
2
,
16
,
32
]
BLOCK_SIZES
=
[
32
,
128
]
BLOCK_SIZES
=
[
32
,
128
]
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
def
_group_tokens_by_expert
(
def
_group_tokens_by_expert
(
...
...
tests/kernels/moe/test_moe_permute_unpermute.py
View file @
bb4337b3
...
@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
...
@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
moe_unpermute
,
moe_unpermute
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
NUM_EXPERTS
=
[
16
,
64
,
256
]
NUM_EXPERTS
=
[
16
,
64
,
256
]
TOP_KS
=
[
2
,
6
,
8
]
TOP_KS
=
[
2
,
6
,
8
]
EP_SIZE
=
[
1
,
4
,
16
]
EP_SIZE
=
[
1
,
4
,
16
]
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
pytest
.
skip
(
pytest
.
skip
(
...
@@ -226,7 +227,7 @@ def test_moe_permute_unpermute(
...
@@ -226,7 +227,7 @@ def test_moe_permute_unpermute(
n_local_expert
,
expert_map
,
_
=
determine_expert_map
(
ep_size
,
ep_rank
,
n_expert
)
n_local_expert
,
expert_map
,
_
=
determine_expert_map
(
ep_size
,
ep_rank
,
n_expert
)
expert_map
=
expert_map
.
cuda
()
expert_map
=
expert_map
.
cuda
()
start_expert
=
n_local_expert
*
ep_rank
start_expert
=
n_local_expert
*
ep_rank
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
hidden_states
=
torch
.
randn
((
n_token
,
n_hidden
),
device
=
"cuda"
).
to
(
dtype
)
hidden_states
=
torch
.
randn
((
n_token
,
n_hidden
),
device
=
"cuda"
).
to
(
dtype
)
gating_output
=
torch
.
randn
((
n_token
,
n_expert
),
device
=
"cuda"
).
to
(
dtype
)
gating_output
=
torch
.
randn
((
n_token
,
n_expert
),
device
=
"cuda"
).
to
(
dtype
)
topk_weights
,
topk_ids
,
token_expert_indices
=
fused_topk
(
topk_weights
,
topk_ids
,
token_expert_indices
=
fused_topk
(
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment